base_diff.patch

diff --color -r runtime/megatron/arguments.py ../Megatron-LM-base/megatron/arguments.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/arguments.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
27,28d18
< import torch
< import json 
30c20,21
< DEBUG_FIX_WEIGHT = os.environ.get("DEBUG_FIX_WEIGHT", '0') == '1'
---
> 
> import torch
54,57d44
<     # Aceso arguments
<     parser = _add_flexpipe_args(parser)
<     parser = _add_profiler_args(parser)
< 
71,128c58,75
< 
<     if args.prof_tp_size is not None:
<         args.global_batch_size = 1 
<         args.micro_batch_size = 1
<         args.num_ops_in_each_stage = [1]
<         args.virtual_pipeline_model_parallel_size = 1
<         args.model_parallel_size_of_each_op = [[args.prof_tp_size]]
<         args.data_parallel_size_of_each_op = [[1]]
<         args.model_name = ""
<         args.resharding_stages = [True]
< 
<         if len(args.prof_repeat_times) > 1:
<             assert args.prof_repeat_threshold is not None, "when args.prof_repeat_times is a list, a threshold is required."
<         _print_args(args)
<         return args
<     else:
<         assert args.flexpipe_config is not None, "An Aceso config should be provided."
<         args.log_name = args.flexpipe_config.split("/")[-1].split(".json")[0]
< 
<     with open(args.flexpipe_config, "r") as f:
<         config_dict = json.load(f)
< 
<     args.model_name = config_dict["model_name"]
<     args.global_batch_size = config_dict["global_batch_size"]
<     args.micro_batch_size = config_dict["micro_batch_size"]
<     args.num_layers = config_dict["num_layers"]
< 
<     if args.model_name in ["gpt"]:
<         args.num_attention_heads = config_dict["num_attention_heads"]
<         args.hidden_size = config_dict["hidden_size"]
<         args.max_position_embeddings = config_dict["max_position_embeddings"]
<         args.seq_length = config_dict["seq_length"]
<     elif args.model_name in ["resnet"]:
<         args.in_channels = config_dict["in_channels"]
<         args.width_factor = config_dict["width_factor"]
<     elif args.model_name in ["t5"]:
<         args.encoder_seq_length = config_dict["encoder_seq_length"]
<         args.decoder_seq_length = config_dict["decoder_seq_length"]
<         args.seq_length = config_dict["encoder_seq_length"]
<         args.max_position_embeddings = config_dict["max_position_embeddings"]
<         args.num_attention_heads = config_dict["num_attention_heads"]
<         args.kv_channels = config_dict["kv_channels"]
<         args.hidden_size = config_dict["hidden_size"]
<         args.ffn_hidden_size = config_dict["ffn_hidden_size"]
<              
<     args.num_ops_in_each_stage = config_dict["num_ops_in_each_stage"]
<     args.num_gpus = config_dict["num_gpus"]
<     args.num_stages = config_dict["num_stages"]
<     args.algo_of_each_op = config_dict["algo_of_each_op"]
<     args.model_parallel_size_of_each_op = config_dict["model_parallel_size_of_each_op"]
<     args.data_parallel_size_of_each_op = config_dict["data_parallel_size_of_each_op"]
<     args.recompute_ops = config_dict["recompute_ops"]
<     args.resharding_stages = config_dict["resharding_stages"]
<     args.checkpoint_activations = config_dict["checkpoint_activations"]
< 
<     assert args.world_size == sum(args.num_gpus), \
<         'number of GPUs should be equal to sum(mp_size * dp_size)'
<     
---
>     # Tensor model parallel size.
>     args.tensor_model_parallel_size = min(
>         args.tensor_model_parallel_size, args.world_size)
>     assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
>         ' ({}) is not divisible by tensor model parallel size ({})'.format(
>             args.world_size, args.tensor_model_parallel_size)
>     # Pipeline model parallel size.
>     args.pipeline_model_parallel_size = min(
>         args.pipeline_model_parallel_size,
>         (args.world_size // args.tensor_model_parallel_size))
>     # Checks.
>     model_parallel_size = args.pipeline_model_parallel_size * \
>                           args.tensor_model_parallel_size
>     assert args.world_size % model_parallel_size == 0, 'world size is not'\
>         ' divisible by tensor parallel size ({}) times pipeline parallel ' \
>         'size ({})'.format(args.world_size, args.tensor_model_parallel_size,
>                            args.pipeline_model_parallel_size)
>     args.data_parallel_size = args.world_size // model_parallel_size
130,137c77,93
<         print('[FlexPipe] using world size: {}, data-parallel-size: {}, '
<             'tensor-model-parallel size: {}, '
<             'pipeline-model-parallel size: {} '
<             'interleave-factor: {}'.format(
<                 args.world_size, args.data_parallel_size_of_each_op,
<                 args.model_parallel_size_of_each_op,
<                 args.pipeline_model_parallel_size,
<                 args.interleave_factor), flush=True)
---
>         print('using world size: {}, data-parallel-size: {}, '
>               'tensor-model-parallel size: {}, '
>               'pipeline-model-parallel size: {} '.format(
>                   args.world_size, args.data_parallel_size,
>                   args.tensor_model_parallel_size,
>                   args.pipeline_model_parallel_size), flush=True)
> 
>     # Deprecated arguments
>     assert args.batch_size is None, '--batch-size argument is no longer ' \
>         'valid, use --micro-batch-size instead'
>     del args.batch_size
>     assert args.warmup is None, '--warmup argument is no longer valid, use ' \
>         '--lr-warmup-fraction instead'
>     del args.warmup
>     assert args.model_parallel_size is None, '--model-parallel-size is no ' \
>         'longer valid, use --tensor-model-parallel-size instead'
>     del args.model_parallel_size
153a110
>     assert args.micro_batch_size is not None
155,162c112,129
<     dp_size_list = []
<     for i in range(args.num_stages):
<         dp_size_list += args.data_parallel_size_of_each_op[i]
<     for i in range(len(dp_size_list)):
<         assert args.micro_batch_size % dp_size_list[i] == 0
<     assert args.global_batch_size % args.micro_batch_size == 0
< 
<     args.virtual_pipeline_model_parallel_size = args.interleave_factor
---
>     if args.global_batch_size is None:
>         args.global_batch_size = args.micro_batch_size * args.data_parallel_size
>         if args.rank == 0:
>             print('setting global batch size to {}'.format(
>                 args.global_batch_size), flush=True)
>     assert args.global_batch_size > 0
>     if args.num_layers_per_virtual_pipeline_stage is not None:
>         assert args.pipeline_model_parallel_size > 2, \
>             'pipeline-model-parallel size should be greater than 2 with ' \
>             'interleaved schedule'
>         assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
>             'number of layers is not divisible by number of layers per virtual ' \
>             'pipeline stage'
>         args.virtual_pipeline_model_parallel_size = \
>             (args.num_layers // args.pipeline_model_parallel_size) // \
>             args.num_layers_per_virtual_pipeline_stage
>     else:
>         args.virtual_pipeline_model_parallel_size = None
227a195,200
>     # Check required arguments.
>     required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
>                      'max_position_embeddings']
>     for req_arg in required_args:
>         _check_arg_is_not_none(args, req_arg)
> 
229,242c202,214
<     if args.model_name in ["gpt"]:
<         if args.ffn_hidden_size is None:
<             args.ffn_hidden_size = 4 * args.hidden_size
< 
<         if args.kv_channels is None:
<             assert args.hidden_size % args.num_attention_heads == 0
<             args.kv_channels = args.hidden_size // args.num_attention_heads
< 
<         if args.seq_length is not None:
<             assert args.encoder_seq_length is None
<             args.encoder_seq_length = args.seq_length
<         else:
<             assert args.encoder_seq_length is not None
<             args.seq_length = args.encoder_seq_length
---
>     if args.ffn_hidden_size is None:
>         args.ffn_hidden_size = 4 * args.hidden_size
> 
>     if args.kv_channels is None:
>         assert args.hidden_size % args.num_attention_heads == 0
>         args.kv_channels = args.hidden_size // args.num_attention_heads
> 
>     if args.seq_length is not None:
>         assert args.encoder_seq_length is None
>         args.encoder_seq_length = args.seq_length
>     else:
>         assert args.encoder_seq_length is not None
>         args.seq_length = args.encoder_seq_length
244,247c216,219
<         if args.seq_length is not None:
<             assert args.max_position_embeddings >= args.seq_length
<         if args.decoder_seq_length is not None:
<             assert args.max_position_embeddings >= args.decoder_seq_length
---
>     if args.seq_length is not None:
>         assert args.max_position_embeddings >= args.seq_length
>     if args.decoder_seq_length is not None:
>         assert args.max_position_embeddings >= args.decoder_seq_length
264,268d235
<     # set dropout = 0 when DEBUG_FIX_WEIGHT
<     if DEBUG_FIX_WEIGHT:
<         args.attention_dropout = 0
<         args.hidden_dropout = 0
< 
294a262,265
>     group.add_argument('--num-layers', type=int, default=None,
>                        help='Number of transformer layers.')
>     group.add_argument('--hidden-size', type=int, default=None,
>                        help='Tansformer hidden size.')
297a269,270
>     group.add_argument('--num-attention-heads', type=int, default=None,
>                        help='Number of transformer attention heads.')
302a276,278
>     group.add_argument('--max-position-embeddings', type=int, default=None,
>                        help='Maximum number of position embeddings to use. '
>                        'This is the size of position embedding.')
387a364,367
>     group.add_argument('--micro-batch-size', type=int, default=None,
>                        help='Batch size per model instance (local batch size). '
>                        'Global batch size is local batch size times data '
>                        'parallel size times number of micro batches.')
390a371,377
>     group.add_argument('--global-batch-size', type=int, default=None,
>                        help='Training batch size. If set, it should be a '
>                        'multiple of micro-batch-size times data-parallel-size. '
>                        'If this value is None, then '
>                        'use micro-batch-size * data-parallel-size as the '
>                        'global batch size. This choice will result in 1 for '
>                        'number of micro-batches.')
609,615d595
< 
<     group.add_argument('--empty-unused-memory-level', default=0, type=int,
<                        choices=[0, 1, 2],
<                        help='Call torch.cuda.empty_cache() each iteration '
<                        '(training and eval), to reduce fragmentation.'
<                        '0=off, 1=moderate, 2=aggressive.')
< 
651a632,633
>     group.add_argument('--seq-length', type=int, default=None,
>                        help='Maximum sequence length to process.')
773,805d754
< 
< def _add_flexpipe_args(parser):
<     group = parser.add_argument_group(title='flexpipe')
<     group.add_argument('--flexpipe-config', type=str, default=None,
<                        help='Path to flexpipe configuration.')
<     group.add_argument('--interleave-factor', type=int, default=1,
<                        help='# of interleaved virtual stages in one physical stage.')                                      
<     group.add_argument('--checkpoint-stages', nargs='+', default=[], 
<                        help="An array of 1/0 to indicate if this stage will be activation checkpointed.")  
<     group.add_argument('--log-path', type=str, default="./", help='')          
<     return parser
< 
< def _add_profiler_args(parser):
<     group = parser.add_argument_group(title='flexpipe_profiler')
< 
<     group.add_argument('--prof-tp-size', type=int, default=None, help='Profiler tp size.')
<     group.add_argument('--prof-path', type=str, default=None, help='')
<     group.add_argument('--prof-cache-file', type=str, default=None, help='')
<     group.add_argument('--prof-model-name', type=str, default='all', help='')
<     group.add_argument('--prof-model-size', type=str, default='all', help='')
<     group.add_argument('--prof-time-only', action='store_true', help='')
<     group.add_argument('--prof-memory-only', action='store_true', help='')
<     group.add_argument('--prof-warmup-times', type=int, default=20, help='')
<     group.add_argument('--prof-repeat-times', nargs='+', type=int, default=[50], help='')
<     group.add_argument('--prof-warmup-threshold', type=int, default=None, help='')
<     group.add_argument('--prof-repeat-threshold', type=int, default=None, help='')
<     group.add_argument('--prof-skip-running', action='store_true', help='')
<     group.add_argument('--prof-num-nodes', type=int, default=None, help='')
<     group.add_argument('--prof-node-rank', type=int, default=None, help='')
<     group.add_argument('--prof-ref-data', type=str, default=None, help='')
<     group.add_argument('--prof-mbs-list', nargs='+', type=int, default=None, help='')
< 
<     return parser
\ No newline at end of file
diff --color -r runtime/megatron/data/dataset_utils.py ../Megatron-LM-base/megatron/data/dataset_utils.py
705,709c705,707
< 
<     ## Temporarily bypass the check in Aceso
<     # assert counts[0].item() == (
<     #     torch.distributed.get_world_size() //
<     #     torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))
---
>     assert counts[0].item() == (
>         torch.distributed.get_world_size() //
>         torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))
diff --color -r runtime/megatron/data/gpt_dataset.py ../Megatron-LM-base/megatron/data/gpt_dataset.py
302,306c302,304
< 
<     ## Temporarily bypass the check in Aceso
<     # assert counts[0].item() == (
<     #     torch.distributed.get_world_size() //
<     #     torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))
---
>     assert counts[0].item() == (
>         torch.distributed.get_world_size() //
>         torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))
diff --color -r runtime/megatron/fused_kernels/__init__.py ../Megatron-LM-base/megatron/fused_kernels/__init__.py
81,86d80
<         # Softmax
<         sources=[srcpath / 'scaled_softmax.cpp',
<                  srcpath / 'scaled_softmax_cuda.cu']
<         scaled_softmax_cuda = _cpp_extention_load_helper(
<             "scaled_softmax_cuda", sources, extra_cuda_flags)
< 
diff --color -r runtime/megatron/fused_kernels/layer_norm_cuda_kernel.cu ../Megatron-LM-base/megatron/fused_kernels/layer_norm_cuda_kernel.cu
24c24
< #include "ATen/cuda/DeviceUtils.cuh"
---
> #include <THC/THCDeviceUtils.cuh>
332d331
<     __syncthreads();
648,649d646
<     // prevent race where buf is written again before reads are done
<     __syncthreads();
diff --color -r runtime/megatron/fused_kernels/scaled_masked_softmax.cpp ../Megatron-LM-base/megatron/fused_kernels/scaled_masked_softmax.cpp
35,40d34
< int get_batch_per_block_cuda(
<     int query_seq_len,
<     int key_seq_len,
<     int batches,
<     int attn_heads);
< 
72,79d65
< int get_batch_per_block(
<     int query_seq_len,
<     int key_seq_len,
<     int batches,
<     int attn_heads) {
<     return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
< }
< 
88,89c74
< 
<   m.def("backward",
---
>   m.def("backward", 
92,96d76
< 
<   m.def("get_batch_per_block",
<         &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
<         "Return Batch per block size."
<   );
diff --color -r runtime/megatron/fused_kernels/scaled_masked_softmax_cuda.cu ../Megatron-LM-base/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
31,35d30
< int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){
<     return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads);
< }
< 
< 
47c42
<   TORCH_INTERNAL_ASSERT(key_seq_len <= 4096);
---
>   TORCH_INTERNAL_ASSERT(key_seq_len <= 2048);
diff --color -r runtime/megatron/fused_kernels/scaled_masked_softmax.h ../Megatron-LM-base/megatron/fused_kernels/scaled_masked_softmax.h
93,203d92
< 
< /*
<  * Extended softmax (from native aten pytorch) with following additional features
<  * 1) input scaling
<  */	
< template <typename input_t, typename output_t, typename acc_t, int log2_elements>
< __global__ void scaled_softmax_warp_forward(
<     output_t *dst, 
<     const input_t *src,
<     const acc_t scale, 
<     int micro_batch_size, 
<     int element_count)
< {
<     // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and 
<     // warp_size of method warp_softmax_forward_kernel.
<     constexpr int next_power_of_two = 1 << log2_elements;
<     constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
<     constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
<     constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
<     constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
< 
<     // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, )
<     // gridDim/blockIdx = (seq_len, attn_heads, batches) 
<     int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH;
< 
<     // micro_batch_size might not be a multiple of WARP_BATCH. Check how
<     // many batches have to computed within this WARP.
<     int local_batches = micro_batch_size - first_batch;
<     if (local_batches > WARP_BATCH)
<         local_batches = WARP_BATCH;
< 
<     // there might be multiple batches per warp. compute the index within the batch
<     int local_idx = threadIdx.x;
< 
<     src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
<     dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx;
< 
<     // load data from global memory
<     acc_t elements[WARP_BATCH][WARP_ITERATIONS];
<     input_t temp_data[ELEMENTS_PER_LDG_STG];
<     #pragma unroll
<     for (int i = 0;  i < WARP_BATCH;  ++i) {
<         int batch_element_count = (i >= local_batches) ? 0 : element_count;
< 
<         #pragma unroll
<         for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
<             int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
< 
<             if (element_index < batch_element_count) {
<                 int itr_idx = i*element_count+it*WARP_SIZE;
<                 copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + itr_idx);
< 
<                 #pragma unroll
<                 for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
<                     elements[i][it + element] = (acc_t)temp_data[element] * scale;
<                 }
<             } else {
<                 #pragma unroll
<                 for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
<                     elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
<                 }
<             }
<         }
<     }
< 
<     // compute max_value
<     acc_t max_value[WARP_BATCH];
<     #pragma unroll
<     for (int i = 0;  i < WARP_BATCH;  ++i) {
<         max_value[i] = elements[i][0];
<         #pragma unroll
<         for (int it = 1;  it < WARP_ITERATIONS;  ++it) {
<             max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
<         }
<     }
<     warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);
< 
<     acc_t sum[WARP_BATCH] { 0.0f };
<     #pragma unroll
<     for (int i = 0;  i < WARP_BATCH;  ++i) {
<         #pragma unroll
<         for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
<             elements[i][it] = std::exp((elements[i][it] - max_value[i]));
<             sum[i] += elements[i][it];
<         }
<     }
<     warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
< 
<     // store result
<     output_t out[ELEMENTS_PER_LDG_STG];
<     #pragma unroll
<     for (int i = 0;  i < WARP_BATCH;  ++i) {
<         if (i >= local_batches)
<             break;
<         #pragma unroll
<         for (int it = 0;  it < WARP_ITERATIONS;  it+=ELEMENTS_PER_LDG_STG) {
<             int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
<             if (element_index < element_count) {
<                 #pragma unroll
<                 for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
<                     out[element] = elements[i][it + element] / sum[i];
<                 }
<                 copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);  
<             } else {
<                 break;
<             } 
<         }
<     }
< }
< 
< 
225c114
<     constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
---
>     constexpr int ELEMENTS_PER_LDG_STG = 4;
344c233
<     constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
---
>     constexpr int ELEMENTS_PER_LDG_STG = 4;
424,465d312
< } // end of anonymous namespace
< 
< int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads){
<     int log2_elements = log2_ceil(key_seq_len);
<     const int next_power_of_two = 1 << log2_elements;
< 
<     int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
<     int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
< 
<     constexpr int threads_per_block = 128;
<     int warps_per_block = (threads_per_block / warp_size);
<     int batches_per_block = warps_per_block * batches_per_warp;
< 
<     return batches_per_block;
< }
< 
< template<typename input_t, typename output_t, typename acc_t>
< void dispatch_scaled_softmax_forward(
<     output_t *dst, 
<     const input_t *src, 
<     const input_t scale, 
<     int query_seq_len, 
<     int key_seq_len, 
<     int batches,
<     int attn_heads)
< {
<     TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 );
<     if (key_seq_len == 0) {
<         return;
<     } else {
<         int log2_elements = log2_ceil(key_seq_len);
<         const int next_power_of_two = 1 << log2_elements;
<         int batch_count = batches * attn_heads * query_seq_len;
< 
<         // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
<         int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
< 
<         // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
<         int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
< 
<         // use 128 threads per block to maximimize gpu utilization
<         constexpr int threads_per_block = 128;
467,530c314
<         int warps_per_block = (threads_per_block / warp_size);
<         int batches_per_block = warps_per_block * batches_per_warp;
<         TORCH_INTERNAL_ASSERT(query_seq_len%batches_per_block == 0);
<         dim3 blocks(query_seq_len/batches_per_block, attn_heads, batches);
<         dim3 threads(warp_size, warps_per_block, 1);
<         // Launch code would be more elegant if C++ supported FOR CONSTEXPR
<         switch (log2_elements) {
<             case 0: // 1
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 0>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 1: // 2
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 1>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 2: // 4
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 2>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 3: // 8
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 3>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 4: // 16
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 4>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 5: // 32
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 5>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 6: // 64
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 6>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 7: // 128
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 7>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 8: // 256
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 8>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 9: // 512
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 9>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 10: // 1024
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 10>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 11: // 2048
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 11>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             case 12: // 4096
<                 scaled_softmax_warp_forward<input_t, output_t, acc_t, 12>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, scale, batch_count, key_seq_len);
<                 break;
<             default:
<                 break;
<         }
<     }
< }
---
> } // end of anonymous namespace
544c328
<     TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 );
---
>     TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 2048 );
616,619d399
<             case 12: // 4096
<                 scaled_masked_softmax_warp_forward<input_t, output_t, acc_t, 12>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches);
<                 break;
637c417
<     TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 4096 );
---
>     TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 2048 );
708,712d487
< 			case 12: // 4096
<                 scaled_masked_softmax_warp_backward<input_t, output_t, acc_t, 12>
<                     <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(grad_input, grad, output, scale, batch_count, key_seq_len);
<                 break;
< 
Only in runtime/megatron/fused_kernels: scaled_softmax.cpp
Only in runtime/megatron/fused_kernels: scaled_softmax_cuda.cu
diff --color -r runtime/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h ../Megatron-LM-base/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h
128c128
<     constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
---
>     constexpr int ELEMENTS_PER_LDG_STG = 4;
248c248
<     constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4;
---
>     constexpr int ELEMENTS_PER_LDG_STG = 4;
364d363
< 
455d453
< 
Only in runtime/megatron/fused_kernels: tests
diff --color -r runtime/megatron/global_vars.py ../Megatron-LM-base/megatron/global_vars.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/global_vars.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
43d34
< PROFILING = os.environ.get("PROFILING", '1') == '1'
190a182
> 
199d190
<         self.logged_times = 0
203,219c194
<         if PROFILING:
<             assert not self.started_, f"{self.name_} timer has already been started"
<             torch.cuda.synchronize()
<             self.start_time = time.time()
<             self.started_ = True
< 
<     def stop(self):
<         """Stop the timer."""
<         if PROFILING:
<             assert self.started_, 'timer is not started'
<             torch.cuda.synchronize()
<             self.elapsed_ += (time.time() - self.start_time)
<             self.started_ = False
< 
<     def start_v2(self):
<         """Start the timer."""
<         assert not self.started_, f"{self.name_} timer has already been started"
---
>         assert not self.started_, 'timer has already been started'
224c199
<     def stop_v2(self):
---
>     def stop(self):
226c201
<         assert self.started_, f'{self.name_} timer is not started'
---
>         assert self.started_, 'timer is not started'
229,230c204
<         self.started_ = False    
<         self.logged_times += 1        
---
>         self.started_ = False
236d209
<         self.logged_times = 0
278,282d250
< 
<         from megatron import mpu
<         from megatron.utils import report_memory
<         from megatron.utils import debug_mem_report
< 
284,287c252
<         time_to_csv = [[],[]]
<         string = f'\n==> Time (ms) | [stage {mpu.get_pipeline_model_parallel_rank()}, virtual {mpu.get_virtual_pipeline_model_parallel_rank()}, rank {torch.distributed.get_rank()}]'
<         string_ops = f'\n==> OP Time (us) | [stage {mpu.get_pipeline_model_parallel_rank()}, virtual {mpu.get_virtual_pipeline_model_parallel_rank()}, rank {torch.distributed.get_rank()}]'
<         string_mem, mem_to_csv = report_memory(f"\n==> Memory | [stage {mpu.get_pipeline_model_parallel_rank()}, virtual {mpu.get_virtual_pipeline_model_parallel_rank()}, rank {torch.distributed.get_rank()}]", get_list=True)
---
>         string = 'time (ms)'
289,301c254,256
<             logged_times = self.timers[name].logged_times
<             if logged_times > 0:
<                 elapsed_time = self.timers[name].elapsed(
<                     reset=reset) * 1000000.0 / logged_times
<                 string_ops += ' | {}: {:.2f}'.format(name, elapsed_time)
<             else:                   
<                 elapsed_time = self.timers[name].elapsed(
<                     reset=reset) * 1000.0 / normalizer
<                 string += ' | {}: {:.2f}'.format(name, elapsed_time)
<             time_to_csv[0].append(name)
<             time_to_csv[1].append(f"{elapsed_time:.2f}")
<         time_to_csv[0] += mem_to_csv[0]
<         time_to_csv[1] += mem_to_csv[1]
---
>             elapsed_time = self.timers[name].elapsed(
>                 reset=reset) * 1000.0 / normalizer
>             string += ' | {}: {:.2f}'.format(name, elapsed_time)
303,304c258,260
<             print(string, flush=True)
<             print(string_mem, flush=True)               
---
>             if torch.distributed.get_rank() == (
>                     torch.distributed.get_world_size() - 1):
>                 print(string, flush=True)
307,308d262
< 
<         return time_to_csv
\ No newline at end of file
diff --color -r runtime/megatron/initialize.py ../Megatron-LM-base/megatron/initialize.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/initialize.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
122,142c114,129
<     if args.model_name in ["gpt"]:
<         seq_len = args.seq_length
< 
<         ## Temporarily bypass the check in Aceso
<         attn_batch_size = 4
<         # attn_batch_size = \
<         #     (args.num_attention_heads / args.tensor_model_parallel_size) * \
<         #     args.micro_batch_size
< 
<         # Constraints on sequence length and attn_batch_size to enable warp based
<         # optimization and upper triangular optimization (for causal mask)
<         custom_kernel_constraint = seq_len > 16 and seq_len <=2048 and \
<             seq_len % 4 == 0 and attn_batch_size % 4 == 0
<         # Print a warning.
<         if not ((args.fp16 or args.bf16) and
<                 custom_kernel_constraint and
<                 args.masked_softmax_fusion):
<             if args.rank == 0:
<                 print('WARNING: constraints for invoking optimized'
<                     ' fused softmax kernel are not met. We default'
<                     ' back to unfused kernel invocations.', flush=True)
---
>     seq_len = args.seq_length
>     attn_batch_size = \
>         (args.num_attention_heads / args.tensor_model_parallel_size) * \
>         args.micro_batch_size
>     # Constraints on sequence length and attn_batch_size to enable warp based
>     # optimization and upper triangular optimization (for causal mask)
>     custom_kernel_constraint = seq_len > 16 and seq_len <=2048 and \
>         seq_len % 4 == 0 and attn_batch_size % 4 == 0
>     # Print a warning.
>     if not ((args.fp16 or args.bf16) and
>             custom_kernel_constraint and
>             args.masked_softmax_fusion):
>         if args.rank == 0:
>             print('WARNING: constraints for invoking optimized'
>                   ' fused softmax kernel are not met. We default'
>                   ' back to unfused kernel invocations.', flush=True)
203d189
<     # Aceso: use different initialization function
208c194,196
<             mpu.initialize_model_parallel_flexpipe()
---
>             mpu.initialize_model_parallel(args.tensor_model_parallel_size,
>                                           args.pipeline_model_parallel_size,
>                                           args.virtual_pipeline_model_parallel_size)
diff --color -r runtime/megatron/microbatches.py ../Megatron-LM-base/megatron/microbatches.py
27,31c27,28
<             args.global_batch_size, args.micro_batch_size, 1)
< 
<         # num_microbatches_calculator = ConstantNumMicroBatches(
<         #     args.global_batch_size, args.micro_batch_size,
<         #     args.data_parallel_size)            
---
>             args.global_batch_size, args.micro_batch_size,
>             args.data_parallel_size)
36,53c33,50
<     # else:
<     #     assert len(args.rampup_batch_size) == 3, 'expected the following ' \
<     #         'format: --rampup-batch-size <start batch size> ' \
<     #         '<batch size incerement> <ramp-up samples>'
<     #     start_batch_size = int(args.rampup_batch_size[0])
<     #     batch_size_increment = int(args.rampup_batch_size[1])
<     #     ramup_samples = int(args.rampup_batch_size[2])
<     #     if args.rank == 0:
<     #         print('will use batch size rampup starting from global batch '
<     #               'size {} to global batch size {} with batch size increments '
<     #               '{} over {} samples.'.format(start_batch_size,
<     #                                            args.global_batch_size,
<     #                                            batch_size_increment,
<     #                                            ramup_samples), flush=True)
<     #     num_microbatches_calculator = RampupBatchsizeNumMicroBatches(
<     #         start_batch_size, batch_size_increment, ramup_samples,
<     #         args.global_batch_size, args.micro_batch_size,
<     #         args.data_parallel_size)
---
>     else:
>         assert len(args.rampup_batch_size) == 3, 'expected the following ' \
>             'format: --rampup-batch-size <start batch size> ' \
>             '<batch size incerement> <ramp-up samples>'
>         start_batch_size = int(args.rampup_batch_size[0])
>         batch_size_increment = int(args.rampup_batch_size[1])
>         ramup_samples = int(args.rampup_batch_size[2])
>         if args.rank == 0:
>             print('will use batch size rampup starting from global batch '
>                   'size {} to global batch size {} with batch size increments '
>                   '{} over {} samples.'.format(start_batch_size,
>                                                args.global_batch_size,
>                                                batch_size_increment,
>                                                ramup_samples), flush=True)
>         num_microbatches_calculator = RampupBatchsizeNumMicroBatches(
>             start_batch_size, batch_size_increment, ramup_samples,
>             args.global_batch_size, args.micro_batch_size,
>             args.data_parallel_size)
Only in ../Megatron-LM-base/megatron/model: bert_model.py
Only in ../Megatron-LM-base/megatron/model: biencoder_model.py
Only in ../Megatron-LM-base/megatron/model: classification.py
diff --color -r runtime/megatron/model/distributed.py ../Megatron-LM-base/megatron/model/distributed.py
25,26d24
< from megatron.utils import unwrap_model
< from .module import Float16Module
28,29c26
< import os
< LOG_NAME = os.environ.get("LOG_NAME", None)
---
> 
148,149c145
<             # store the start index for the gradients.     
< 
---
>             # store the start index for the gradients.
170,173c166
<         
<         args = get_args()
<         rank_in_pipeline = mpu.get_pipeline_model_parallel_rank()
<         self.resharding = args.resharding_stages[rank_in_pipeline]
---
> 
194c187
<     ## TODO: continious buffer with resharding.
---
> 
198d190
<         # args = get_args()
200,201d191
<             if self.resharding:
<                 raise RuntimeError("cross-op resharding with continues buffer is not supported yet.")
207,309c197,218
<             if self.resharding:
<                 # Otherwise, bucketize and all-reduce
<                 buckets = {}
<                 dp_groups = {}
<                 dp_sizes = {}
<                 # Pack the buckets.
<                 model_ = unwrap_model(self.module, (Float16Module)) 
<                 for op in model_.language_model.ops:
<                     tp_size = op.tp_size
<                     dp_size = op.dp_size
<                     for param in op.parameters():
<                         if param.requires_grad and param.grad is not None:
<                             data_type = param.data.type()
<                             key_str = str(data_type)+str(tp_size)+str(dp_size)
<                             if key_str not in buckets:
<                                 buckets[key_str] = []
<                             buckets[key_str].append(param)
<                             param.main_grad = param.grad
< 
<                             if key_str not in dp_groups:
<                                 dp_groups[key_str] = mpu.get_data_parallel_group_via_op_index(op.op_index)
<                                 dp_sizes[key_str] = dp_size
< 
<                 # For each bucket, all-reduce and copy all-reduced grads.
<                 for key_str in buckets:
<                     bucket = buckets[key_str]
<                     grads = [param.grad.data for param in bucket]
<                     coalesced = _flatten_dense_tensors(grads)
<                     coalesced /= dp_sizes[key_str]
<                     torch.distributed.all_reduce(
<                         coalesced, group=dp_groups[key_str])
<                     for buf, synced in zip(grads, _unflatten_dense_tensors(
<                             coalesced, grads)):
<                         buf.copy_(synced)
<             else:
<                 # Otherwise, bucketize and all-reduce
<                 buckets = {}
<                 # Pack the buckets.
<                 for param in self.module.parameters():
<                     if param.requires_grad and param.grad is not None:
<                         tp = param.data.type()
<                         if tp not in buckets:
<                             buckets[tp] = []
<                         buckets[tp].append(param)
<                         param.main_grad = param.grad
< 
<                 # For each bucket, all-reduce and copy all-reduced grads.
<                 for tp in buckets:
<                     bucket = buckets[tp]
<                     grads = [param.grad.data for param in bucket]
<                     coalesced = _flatten_dense_tensors(grads)
<                     coalesced /= mpu.get_data_parallel_world_size()
<                     torch.distributed.all_reduce(
<                         coalesced, group=mpu.get_data_parallel_group())
<                     for buf, synced in zip(grads, _unflatten_dense_tensors(
<                             coalesced, grads)):
<                         buf.copy_(synced)                
< 
< 
<     # def allreduce_gradients(self):
<     #     """Reduce gradients across data parallel ranks."""
<     #     # If we have buffers, simply reduce the data in the buffer.
<     #     if self._grad_buffers is not None:
<     #         for _, buffer_ in self._grad_buffers.items():
<     #             buffer_.data /= mpu.get_data_parallel_world_size()
<     #             torch.distributed.all_reduce(
<     #                 buffer_.data, group=mpu.get_data_parallel_group())
<     #     else:
<     #         # Otherwise, bucketize and all-reduce
<     #         buckets = {}
<     #         # Pack the buckets.
<     #         for param in self.module.parameters():
<     #             if param.requires_grad and param.grad is not None:
<     #                 tp = param.data.type()
<     #                 if tp not in buckets:
<     #                     buckets[tp] = []
<     #                 buckets[tp].append(param)
<     #                 param.main_grad = param.grad
< 
<     #         # print(f"[DEBUG] ======> allreduce_gradients <=====")
<     #         # for name, params in self.module.named_parameters():
<     #         #     if params.requires_grad:
<     #         #         if params.grad is not None:
<     #         #             string = f"[DEBUG] param name {name}, requires_grad: {params.requires_grad},\n main_grad: {params.main_grad}"
<     #         #         else:
<     #         #             string = f"[DEBUG] param name {name}, requires_grad: {params.requires_grad},\n grad = None"
<     #         #     else:
<     #         #         string = f"[DEBUG] param name {name}, requires_grad: {params.requires_grad}"
<     #         #     with open(f"{LOG_NAME}_debug_grad_rank_{torch.distributed.get_rank()}.log", "a+") as f:
<     #         #         f.write(string+"\n")  
< 
< 
<     #         # For each bucket, all-reduce and copy all-reduced grads.
<     #         for tp in buckets:
<     #             bucket = buckets[tp]
<     #             grads = [param.grad.data for param in bucket]
<     #             coalesced = _flatten_dense_tensors(grads)
<     #             coalesced /= mpu.get_data_parallel_world_size()
<     #             torch.distributed.all_reduce(
<     #                 coalesced, group=mpu.get_data_parallel_group())
<     #             for buf, synced in zip(grads, _unflatten_dense_tensors(
<     #                     coalesced, grads)):
<     #                 buf.copy_(synced)
---
>             # Otherwise, bucketize and all-reduce
>             buckets = {}
>             # Pack the buckets.
>             for param in self.module.parameters():
>                 if param.requires_grad and param.grad is not None:
>                     tp = param.data.type()
>                     if tp not in buckets:
>                         buckets[tp] = []
>                     buckets[tp].append(param)
>                     param.main_grad = param.grad
> 
>             # For each bucket, all-reduce and copy all-reduced grads.
>             for tp in buckets:
>                 bucket = buckets[tp]
>                 grads = [param.grad.data for param in bucket]
>                 coalesced = _flatten_dense_tensors(grads)
>                 coalesced /= mpu.get_data_parallel_world_size()
>                 torch.distributed.all_reduce(
>                     coalesced, group=mpu.get_data_parallel_group())
>                 for buf, synced in zip(grads, _unflatten_dense_tensors(
>                         coalesced, grads)):
>                     buf.copy_(synced)
Only in runtime/megatron/model: flex_gpt.py
Only in runtime/megatron/model: flex_model.py
Only in runtime/megatron/model: flex_ops.py
Only in runtime/megatron/model: flex_resnet.py
Only in runtime/megatron/model: flex_t5.py
diff --color -r runtime/megatron/model/fused_softmax.py ../Megatron-LM-base/megatron/model/fused_softmax.py
16d15
< 
18d16
< import torch.nn as nn
34a33
> 
38d36
< 
46a45
> 
50d48
< 
68c66,68
<         softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
---
>         softmax_results = scaled_masked_softmax_cuda.forward(
>             inputs, mask, scale_t[0]
>         )
84,115c84
< class ScaledSoftmax(torch.autograd.Function):
<     """
<     Fused operation which performs following two operations in sequence
<     1. Scale the tensor.
<     2. Perform softmax.
<     """
< 
<     @staticmethod
<     def forward(ctx, inputs, scale):
<         import scaled_softmax_cuda
< 
<         scale_t = torch.tensor([scale])
< 
<         softmax_results = scaled_softmax_cuda.forward(
<             inputs, scale_t[0]
<         )
<         ctx.save_for_backward(softmax_results, scale_t)
<         return softmax_results
< 
<     @staticmethod
<     def backward(ctx, output_grads):
<         import scaled_softmax_cuda
< 
<         softmax_results, scale_t = ctx.saved_tensors
< 
<         input_grads = scaled_softmax_cuda.backward(
<             output_grads, softmax_results, scale_t[0]
<         )
<         return input_grads, None, None
< 
< 
< class FusedScaleMaskSoftmax(nn.Module):
---
> class FusedScaleMaskSoftmax(torch.nn.Module):
118d86
< 
121d88
<         input_in_bf16: flag to indicate if input in bf16 data format.
123d89
<         scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion
126a93
> 
142,144c109,110
<         assert not (
<             self.input_in_fp16 and self.input_in_bf16
<         ), "both fp16 and bf16 flags cannot be active at the same time."
---
>         assert not (self.input_in_fp16 and self.input_in_bf16),\
>             'both fp16 and bf16 flags cannot be active at the same time.'
155c121
< 
---
>  
159,161c125,148
< 
<         if self.is_kernel_available(mask, *input.size()):
<             return self.forward_fused_softmax(input, mask)
---
>         data_size = input.size()
>         query_seq_len = data_size[-2]
>         key_seq_len = data_size[-1]
>         attn_batch_size = data_size[0] * data_size[1]
> 
>         # constraints on various tensor dimensions to enable warp based
>         # optimization and upper triangular optimization (for causal mask)
>         custom_kernel_constraint = key_seq_len > 16 and key_seq_len <= 2048 and \
>             query_seq_len % 4 == 0 and attn_batch_size % 4 == 0
> 
>         # invoke custom kernel
>         if self.input_in_float16 and mask is not None and \
>             custom_kernel_constraint and self.scaled_masked_softmax_fusion:
>             scale = self.scale if self.scale is not None else 1.0
> 
>             if self.attn_mask_type == AttnMaskType.causal:
>                 assert query_seq_len == key_seq_len, \
>                     "causal mask is only for self attention"
>                 input = input.view(-1, query_seq_len, key_seq_len)
>                 probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
>                 probs = probs.view(*data_size)
>             else:
>                 assert self.attn_mask_type == AttnMaskType.padding
>                 probs = ScaledMaskedSoftmax.apply(input, mask, scale)
163,166c150,151
<             return self.forward_torch_softmax(input, mask)
< 
<     def is_kernel_available(self, mask, b, np, sq, sk):
<         attn_batches = b * np
---
>             if self.input_in_float16 and self.softmax_in_fp32:
>                 input = input.float()
168,180c153,160
<         if (
<             self.scaled_masked_softmax_fusion  # user want to fuse
<             and self.input_in_float16  # input must be fp16
<             and 16 < sk <= 4096  # sk must be 16 ~ 2048
<             and sq % 4 == 0  # sq must be divisor of 4
<             and attn_batches % 4 == 0  # np * b must be divisor of 4
<         ):
<             if 0 <= sk <= 4096:
<                 batch_per_block = self.get_batch_per_block(sq, sk, b, np)
< 
<                 if self.attn_mask_type == AttnMaskType.causal:
<                     if attn_batches % batch_per_block == 0:
<                         return True
---
>             if self.scale is not None:
>                 input = input * self.scale
>             mask_output = self.mask_func(input, mask) if mask is not None else input
>             probs = torch.nn.Softmax(dim=-1)(mask_output)
> 
>             if self.input_in_float16 and self.softmax_in_fp32:
>                 if self.input_in_fp16:
>                     probs = probs.half()
182,217c162
<                     if sq % batch_per_block == 0:
<                         return True
<         return False
< 
<     def forward_fused_softmax(self, input, mask):
<         b, np, sq, sk = input.size()
<         scale = self.scale if self.scale is not None else 1.0
< 
<         if self.attn_mask_type == AttnMaskType.causal:
<             assert sq == sk, "causal mask is only for self attention"
< 
<             # input is 3D tensor (attn_batches, sq, sk)
<             input = input.view(-1, sq, sk)
<             probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
<             return probs.view(b, np, sq, sk)
<         else:
<             # input is 4D tensor (b, np, sq, sk)
<             if mask is not None:
<                 return ScaledMaskedSoftmax.apply(input, mask, scale)
<             else:
<                 return ScaledSoftmax.apply(input, scale)
< 
<     def forward_torch_softmax(self, input, mask):
<         if self.input_in_float16 and self.softmax_in_fp32:
<             input = input.float()
< 
<         if self.scale is not None:
<             input = input * self.scale
<         mask_output = self.mask_func(input, mask) if mask is not None else input
<         probs = torch.nn.Softmax(dim=-1)(mask_output)
< 
<         if self.input_in_float16 and self.softmax_in_fp32:
<             if self.input_in_fp16:
<                 probs = probs.half()
<             else:
<                 probs = probs.bfloat16()
---
>                     probs = probs.bfloat16()
220,225d164
< 
<     @staticmethod
<     def get_batch_per_block(sq, sk, b, np):
<         import scaled_masked_softmax_cuda
< 
<         return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
Only in ../Megatron-LM-base/megatron/model: gpt_model.py
diff --color -r runtime/megatron/model/__init__.py ../Megatron-LM-base/megatron/model/__init__.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/model/__init__.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
26a19,22
> from .bert_model import BertModel
> from .gpt_model import GPTModel
> from .t5_model import T5Model
> from .language_model import get_language_model
28,30d23
< from .flex_gpt import FlexGPTModel
< from .flex_t5 import FlexT5Model
< from .flex_resnet import FlexResNet
\ No newline at end of file
diff --color -r runtime/megatron/model/language_model.py ../Megatron-LM-base/megatron/model/language_model.py
22a23,27
> from .module import MegatronModule
> from megatron.model.enums import LayerType, AttnMaskType
> from megatron.model.transformer import ParallelTransformer
> from megatron.model.utils import get_linear_layer
> from megatron.model.utils import init_method_normal, scaled_init_method_normal
34d38
<     
41,52d44
< def new_parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None):
<     """LM logits using word embedding weights."""
<     # Parallel logits.
<     input_parallel = mpu._PrimReplicate.apply(input_)
<     # Matrix multiply.
<     if bias is None:
<         logits_parallel = F.linear(input_parallel, word_embeddings_weight)
<     else:
<         logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)
<     # Gather if needed.
<     if parallel_output:
<         return logits_parallel
54c46,464
<     return mpu._PrimAllGather(logits_parallel)
---
> def get_language_model(num_tokentypes, add_pooler,
>                        encoder_attn_mask_type, init_method=None,
>                        scaled_init_method=None, add_decoder=False,
>                        decoder_attn_mask_type=AttnMaskType.causal,
>                        pre_process=True, post_process=True):
>     """Build language model and return along with the key to save."""
>     args = get_args()
> 
>     if init_method is None:
>         init_method = init_method_normal(args.init_method_std)
> 
>     if scaled_init_method is None:
>         scaled_init_method = scaled_init_method_normal(args.init_method_std,
>                                                        args.num_layers)
> 
>     # Language model.
>     language_model = TransformerLanguageModel(
>         init_method,
>         scaled_init_method,
>         encoder_attn_mask_type,
>         num_tokentypes=num_tokentypes,
>         add_decoder=add_decoder,
>         decoder_attn_mask_type=decoder_attn_mask_type,
>         add_pooler=add_pooler,
>         pre_process=pre_process,
>         post_process=post_process
>     )
>     # key used for checkpoints.
>     language_model_key = 'language_model'
> 
>     return language_model, language_model_key
> 
> 
> class Pooler(MegatronModule):
>     """Pooler layer.
> 
>     Pool hidden states of a specific token (for example start of the
>     sequence) and add a linear transformation followed by a tanh.
> 
>     Arguments:
>         hidden_size: hidden size
>         init_method: weight initialization method for the linear layer.
>             bias is set to zero.
>     """
> 
>     def __init__(self, hidden_size, init_method):
>         super(Pooler, self).__init__()
>         self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
> 
>     def forward(self, hidden_states, sequence_index=0):
>         # hidden_states: [b, s, h]
>         # sequence_index: index of the token to pool.
>         pooled = hidden_states[:, sequence_index, :]
>         pooled = self.dense(pooled)
>         pooled = torch.tanh(pooled)
>         return pooled
> 
> 
> class Embedding(MegatronModule):
>     """Language model embeddings.
> 
>     Arguments:
>         hidden_size: hidden size
>         vocab_size: vocabulary size
>         max_sequence_length: maximum size of sequence. This
>                              is used for positional embedding
>         embedding_dropout_prob: dropout probability for embeddings
>         init_method: weight initialization method
>         num_tokentypes: size of the token-type embeddings. 0 value
>                         will ignore this embedding
>     """
> 
>     def __init__(self,
>                  hidden_size,
>                  vocab_size,
>                  max_sequence_length,
>                  embedding_dropout_prob,
>                  init_method,
>                  num_tokentypes=0):
>         super(Embedding, self).__init__()
> 
>         self.hidden_size = hidden_size
>         self.init_method = init_method
>         self.num_tokentypes = num_tokentypes
> 
>         args = get_args()
> 
>         # Word embeddings (parallel).
>         self.word_embeddings = mpu.VocabParallelEmbedding(
>             vocab_size, self.hidden_size,
>             init_method=self.init_method)
>         self._word_embeddings_key = 'word_embeddings'
> 
>         # Position embedding (serial).
>         self.position_embeddings = torch.nn.Embedding(
>             max_sequence_length, self.hidden_size)
>         self._position_embeddings_key = 'position_embeddings'
>         # Initialize the position embeddings.
>         self.init_method(self.position_embeddings.weight)
> 
>         # Token type embedding.
>         # Add this as an optional field that can be added through
>         # method call so we can load a pretrain model without
>         # token types and add them as needed.
>         self._tokentype_embeddings_key = 'tokentype_embeddings'
>         if self.num_tokentypes > 0:
>             self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes,
>                                                            self.hidden_size)
>             # Initialize the token-type embeddings.
>             self.init_method(self.tokentype_embeddings.weight)
>         else:
>             self.tokentype_embeddings = None
> 
>         # Embeddings dropout
>         self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
> 
>     def add_tokentype_embeddings(self, num_tokentypes):
>         """Add token-type embedding. This function is provided so we can add
>         token-type embeddings in case the pretrained model does not have it.
>         This allows us to load the model normally and then add this embedding.
>         """
>         if self.tokentype_embeddings is not None:
>             raise Exception('tokentype embeddings is already initialized')
>         if torch.distributed.get_rank() == 0:
>             print('adding embedding for {} tokentypes'.format(num_tokentypes),
>                   flush=True)
>         self.num_tokentypes = num_tokentypes
>         self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
>                                                        self.hidden_size)
>         # Initialize the token-type embeddings.
>         args = get_args()
>         self.init_method(self.tokentype_embeddings.weight)
> 
>     def forward(self, input_ids, position_ids, tokentype_ids=None):
>         # Embeddings.
>         words_embeddings = self.word_embeddings(input_ids)
>         position_embeddings = self.position_embeddings(position_ids)
>         embeddings = words_embeddings + position_embeddings
>         if tokentype_ids is not None:
>             assert self.tokentype_embeddings is not None
>             embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
>         else:
>             assert self.tokentype_embeddings is None
> 
>         # Dropout.
>         embeddings = self.embedding_dropout(embeddings)
> 
>         return embeddings
> 
>     def state_dict_for_save_checkpoint(self, destination=None, prefix='',
>                                        keep_vars=False):
>         """For easy load."""
> 
>         state_dict_ = {}
>         state_dict_[self._word_embeddings_key] \
>             = self.word_embeddings.state_dict(destination, prefix, keep_vars)
>         state_dict_[self._position_embeddings_key] \
>             = self.position_embeddings.state_dict(
>                 destination, prefix, keep_vars)
>         if self.num_tokentypes > 0:
>             state_dict_[self._tokentype_embeddings_key] \
>                 = self.tokentype_embeddings.state_dict(
>                     destination, prefix, keep_vars)
> 
>         return state_dict_
> 
>     def load_state_dict(self, state_dict, strict=True):
>         """Customized load."""
> 
>         # Word embedding.
>         if self._word_embeddings_key in state_dict:
>             state_dict_ = state_dict[self._word_embeddings_key]
>         else:
>             # for backward compatibility.
>             state_dict_ = {}
>             for key in state_dict.keys():
>                 if 'word_embeddings' in key:
>                     state_dict_[key.split('word_embeddings.')[1]] \
>                         = state_dict[key]
>         self.word_embeddings.load_state_dict(state_dict_, strict=strict)
> 
>         # Position embedding.
>         if self._position_embeddings_key in state_dict:
>             state_dict_ = state_dict[self._position_embeddings_key]
>         else:
>             # for backward compatibility.
>             state_dict_ = {}
>             for key in state_dict.keys():
>                 if 'position_embeddings' in key:
>                     state_dict_[key.split('position_embeddings.')[1]] \
>                         = state_dict[key]
>         self.position_embeddings.load_state_dict(state_dict_, strict=strict)
> 
>         # Tokentype embedding.
>         if self.num_tokentypes > 0:
>             state_dict_ = {}
>             if self._tokentype_embeddings_key in state_dict:
>                 state_dict_ = state_dict[self._tokentype_embeddings_key]
>             else:
>                 # for backward compatibility.
>                 for key in state_dict.keys():
>                     if 'tokentype_embeddings' in key:
>                         state_dict_[key.split('tokentype_embeddings.')[1]] \
>                             = state_dict[key]
>             if len(state_dict_.keys()) > 0:
>                 self.tokentype_embeddings.load_state_dict(state_dict_,
>                                                           strict=strict)
>             else:
>                 print('***WARNING*** expected tokentype embeddings in the '
>                       'checkpoint but could not find it', flush=True)
> 
> 
> class TransformerLanguageModel(MegatronModule):
>     """Transformer language model.
> 
>     Arguments:
>         transformer_hparams: transformer hyperparameters
>         vocab_size: vocabulary size
>         max_sequence_length: maximum size of sequence. This
>                              is used for positional embedding
>         embedding_dropout_prob: dropout probability for embeddings
>         num_tokentypes: size of the token-type embeddings. 0 value
>                         will ignore this embedding
>     """
> 
>     def __init__(self,
>                  init_method,
>                  output_layer_init_method,
>                  encoder_attn_mask_type,
>                  num_tokentypes=0,
>                  add_decoder=False,
>                  decoder_attn_mask_type=AttnMaskType.causal,
>                  add_pooler=False,
>                  pre_process=True,
>                  post_process=True):
>         super(TransformerLanguageModel, self).__init__()
>         args = get_args()
> 
>         self.pre_process = pre_process
>         self.post_process = post_process
>         self.hidden_size = args.hidden_size
>         self.num_tokentypes = num_tokentypes
>         self.init_method = init_method
>         self.encoder_attn_mask_type = encoder_attn_mask_type
>         self.add_decoder = add_decoder
>         self.decoder_attn_mask_type = decoder_attn_mask_type
>         self.add_pooler = add_pooler
> 
>         # Embeddings.
>         if self.pre_process:
>             self.embedding = Embedding(self.hidden_size,
>                                        args.padded_vocab_size,
>                                        args.max_position_embeddings,
>                                        args.hidden_dropout,
>                                        self.init_method,
>                                        self.num_tokentypes)
>             self._embedding_key = 'embedding'
> 
>         # Transformer.
>         self.encoder = ParallelTransformer(
>             self.init_method,
>             output_layer_init_method,
>             self_attn_mask_type=self.encoder_attn_mask_type,
>             pre_process=self.pre_process,
>             post_process=self.post_process
>         )
>         self._encoder_key = 'encoder'
> 
>         # Decoder
>         if self.add_decoder:
>             assert args.pipeline_model_parallel_size == 1, \
>                 'pipeline parallelism is not supported in the presence of decoder'
>             self.decoder = ParallelTransformer(
>                 self.init_method,
>                 output_layer_init_method,
>                 layer_type=LayerType.decoder,
>                 self_attn_mask_type=self.decoder_attn_mask_type)
>             self._decoder_key = 'decoder'
> 
>         if self.post_process:
>             # Pooler.
>             if self.add_pooler:
>                 self.pooler = Pooler(self.hidden_size, self.init_method)
>                 self._pooler_key = 'pooler'
> 
>     def set_input_tensor(self, input_tensor):
>         """ See megatron.model.transformer.set_input_tensor()"""
>         self.encoder.set_input_tensor(input_tensor)
> 
>     def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
>                 dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
>                 enc_dec_attn_mask=None, tokentype_ids=None, layer_past=None,
>                 get_key_value=False, pooling_sequence_index=0,
>                 enc_hidden_states=None, output_enc_hidden=False):
> 
>         # Embeddings.
>         if self.pre_process:
>             embedding_output = self.embedding(enc_input_ids, enc_position_ids,
>                                               tokentype_ids=tokentype_ids)
>             encoder_input = embedding_output
>         else:
>             encoder_input = None
> 
>         # encoder.
>         if enc_hidden_states is None:
>             encoder_output = self.encoder(encoder_input,
>                                           enc_attn_mask,
>                                           layer_past=layer_past,
>                                           get_key_value=get_key_value)
>         else:
>             encoder_output = enc_hidden_states.to(encoder_input.dtype)
> 
>         if self.post_process:
>             if self.add_pooler:
>                 pooled_output = self.pooler(encoder_output,
>                                             pooling_sequence_index)
> 
>         # output_enc_hidden refers to when we just need the encoder's
>         # output. For example, it is helpful to compute
>         # similarity between two sequences by average pooling
>         if not self.add_decoder or output_enc_hidden:
>             if self.add_pooler and self.post_process:
>                 return encoder_output, pooled_output
>             else:
>                 return encoder_output
> 
>         # Decoder Embedding
>         dec_embedding_output = self.embedding(dec_input_ids,
>                                               dec_position_ids)
>         # decoder
>         decoder_output = self.decoder(dec_embedding_output,
>                                       dec_attn_mask,
>                                       layer_past=layer_past,
>                                       get_key_value=get_key_value,
>                                       encoder_output=encoder_output,
>                                       enc_dec_attn_mask=enc_dec_attn_mask)
> 
>         if self.add_pooler and self.post_process:
>             return decoder_output, encoder_output, pooled_output
>         else:
>             return decoder_output, encoder_output
> 
>     def state_dict_for_save_checkpoint(self, destination=None, prefix='',
>                                        keep_vars=False):
>         """For easy load."""
> 
>         state_dict_ = {}
>         if self.pre_process:
>             state_dict_[self._embedding_key] \
>                 = self.embedding.state_dict_for_save_checkpoint(
>                     destination, prefix, keep_vars)
>         state_dict_[self._encoder_key] \
>             = self.encoder.state_dict_for_save_checkpoint(
>                 destination, prefix, keep_vars)
>         if self.post_process:
>             if self.add_pooler:
>                 state_dict_[self._pooler_key] \
>                     = self.pooler.state_dict_for_save_checkpoint(
>                         destination, prefix, keep_vars)
>         if self.add_decoder:
>             state_dict_[self._decoder_key] \
>                 = self.decoder.state_dict_for_save_checkpoint(
>                     destination, prefix, keep_vars)
> 
>         return state_dict_
> 
>     def load_state_dict(self, state_dict, strict=True):
>         """Customized load."""
> 
>         # Embedding.
>         if self.pre_process:
>             if self._embedding_key in state_dict:
>                 state_dict_ = state_dict[self._embedding_key]
>             else:
>                 # for backward compatibility.
>                 state_dict_ = {}
>                 for key in state_dict.keys():
>                     if '_embeddings' in key:
>                         state_dict_[key] = state_dict[key]
>             self.embedding.load_state_dict(state_dict_, strict=strict)
> 
>         # Encoder.
>         if self._encoder_key in state_dict:
>             state_dict_ = state_dict[self._encoder_key]
>         # for backward compatibility.
>         elif 'transformer' in state_dict:
>             state_dict_ = state_dict['transformer']
>         else:
>             # for backward compatibility.
>             state_dict_ = {}
>             for key in state_dict.keys():
>                 if 'transformer.' in key:
>                     state_dict_[key.split('transformer.')[1]] = state_dict[key]
> 
>         # for backward compatibility.
>         state_dict_self_attention = {}
>         for key in state_dict_.keys():
>             if '.attention.' in key:
>                 state_dict_self_attention[key.replace(".attention.",
>                     ".self_attention.")] = state_dict_[key]
>             else:
>                 state_dict_self_attention[key] = state_dict_[key]
>         state_dict_ = state_dict_self_attention
> 
>         self.encoder.load_state_dict(state_dict_, strict=strict)
> 
>         if self.post_process:
>             # pooler
>             if self.add_pooler:
>                 assert 'pooler' in state_dict, \
>                     'could not find data for pooler in the checkpoint'
>                 self.pooler.load_state_dict(state_dict[self._pooler_key],
>                                             strict=strict)
>         # decoder
>         if self.add_decoder:
>             assert 'decoder' in state_dict, \
>                 'could not find data for pooler in the checkpoint'
>             self.decoder.load_state_dict(state_dict[self._decoder_key],
>                                          strict=strict)
diff --color -r runtime/megatron/model/module.py ../Megatron-LM-base/megatron/model/module.py
1,8d0
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/model/module.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
53,77c45
<         ## flexpipe
<         self.input_extra_tensors_info = {}
<         self.output_extra_tensors_info = {}
<         self.shared_weights_info = {}        
<         self.is_last_op = False
<         ## resharding
<         self.output_extra_specs = None
<         self.output_extra_mats_info = None
<         self.required_input_extra_specs = {}
<         self.input_extra_mats = None
<         self.new_input_extra_tensors = {}
<         self.tmp_buffer = None
<         self.elementwise = False
<         self.input_mats = None
<         self.input_extra_mats = None
< 
<     def parse_op_configs(self, config):
<         self.name = config.name
<         self.prev_name = config.prev_name
<         self.input_tensors_info = config.input_tensors_info
<         self.output_tensors_info = config.output_tensors_info
<         self.input_extra_tensors_info = config.input_extra_tensors_info
<         self.output_extra_tensors_info = config.output_extra_tensors_info        
<         self.shared_weights_info = config.shared_weights_info
<         
---
> 
87,88c55
<             # return self.language_model.embedding.word_embeddings.weight
<             return self.language_model.ops[0].word_embeddings.weight
---
>             return self.language_model.embedding.word_embeddings.weight
93d59
< 
99,146d64
<     # def initialize_word_embeddings(self, init_method_normal):
<     #     args = get_args()
<     #     if not self.share_word_embeddings:
<     #         raise Exception('initialize_word_embeddings() was called but '
<     #                         'share_word_embeddings is false')
< 
<     #     # This function just initializes the word embeddings in the final stage
<     #     # when we are using pipeline parallelism. If we aren't using pipeline
<     #     # parallelism there is nothing to do.
<     #     if args.pipeline_model_parallel_size == 1:
<     #         return
< 
<     #     # Parameters are shared between the word embeddings layer, and the
<     #     # heads at the end of the model. In a pipelined setup with more than
<     #     # one stage, the initial embedding layer and the head are on different
<     #     # workers, so we do the following:
<     #     # 1. Create a second copy of word_embeddings on the last stage, with
<     #     #    initial parameters of 0.0.
<     #     # 2. Do an all-reduce between the first and last stage to ensure that
<     #     #    the two copies of word_embeddings start off with the same
<     #     #    parameter values.
<     #     # 3. In the training loop, before an all-reduce between the grads of
<     #     #    the two word_embeddings layers to ensure that every applied weight
<     #     #    update is the same on both stages.
<     #     if mpu.is_pipeline_last_stage():
<     #         assert not mpu.is_pipeline_first_stage()
<     #         self._word_embeddings_for_head_key = 'word_embeddings_for_head'
<     #         # set word_embeddings weights to 0 here, then copy first
<     #         # stage's weights using all_reduce below.
<     #         self.word_embeddings = mpu.VocabParallelEmbedding(
<     #             args.padded_vocab_size, args.hidden_size,
<     #             init_method=init_method_normal(args.init_method_std))
<     #         self.word_embeddings.weight.data.fill_(0)
<     #         self.word_embeddings.weight.shared = True
< 
<     #     # Ensure that first and last stages have the same initial parameter
<     #     # values.
<     #     if torch.distributed.is_initialized():
<     #         if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
<     #             torch.distributed.all_reduce(self.word_embeddings_weight().data,
<     #                                          group=mpu.get_embedding_group())
<     #     else:
<     #         print("WARNING! Distributed processes aren't initialized, so "
<     #               "word embeddings in the last layer are not initialized. "
<     #               "If you are just manipulating a model this is fine, but "
<     #               "this needs to be handled manually. If you are training "
<     #               "something is definitely wrong.")
< 
151a70,76
> 
>         # This function just initializes the word embeddings in the final stage
>         # when we are using pipeline parallelism. If we aren't using pipeline
>         # parallelism there is nothing to do.
>         if args.pipeline_model_parallel_size == 1:
>             return
> 
165,209c90,111
<             if not mpu.is_pipeline_first_stage():
<                 self._word_embeddings_for_head_key = 'word_embeddings_for_head'
<                 # If first and last stages are different, set word_embeddings
<                 # weights to 0 here, then copy first stage's weights using
<                 # all_reduce below.
<                 self.word_embeddings = mpu.VocabParallelEmbedding(
<                     args.padded_vocab_size, args.hidden_size,
<                     init_method=init_method_normal(args.init_method_std))
<                 self.word_embeddings.weight.data.fill_(0)
<                 self.word_embeddings.weight.shared = True
<         # Ensure that first and last stages have the same initial parameter values.
<         if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
<             torch.distributed.all_reduce(self.word_embeddings_weight().data,
<                                         group=mpu.get_embedding_group())
< 
< class RopalaModule(torch.nn.Module):
<     def __init__(self, op_index, name, prev_name, is_last_op=False):
<         super(RopalaModule, self).__init__()
<         self.name = name
<         self.prev_name = prev_name
<         self.op_index = op_index
<         self.is_last_op = is_last_op
< 
<         self.tp_size = mpu.get_op_tp_size(op_index)
<         self.dp_size = mpu.get_op_dp_size(op_index)
< 
<         self.input_tensors_info = {}
<         self.output_tensors_info = {}
<         self.input_extra_tensors_info = {}
<         self.output_extra_tensors_info = {}
<         self.shared_weights_info = {}        
<         
<         ## resharding
<         self.output_extra_specs = None
<         self.output_extra_mats_info = None
<         self.required_input_extra_specs = {}
<         self.input_extra_mats = None
<         self.new_input_extra_tensors = {}
<         self.tmp_buffer = None
<         self.elementwise = False
<         self.input_mats = None
<         self.input_extra_mats = None
< 
<         ## for profiling
<         self.weight_size = 0
---
>             assert not mpu.is_pipeline_first_stage()
>             self._word_embeddings_for_head_key = 'word_embeddings_for_head'
>             # set word_embeddings weights to 0 here, then copy first
>             # stage's weights using all_reduce below.
>             self.word_embeddings = mpu.VocabParallelEmbedding(
>                 args.padded_vocab_size, args.hidden_size,
>                 init_method=init_method_normal(args.init_method_std))
>             self.word_embeddings.weight.data.fill_(0)
>             self.word_embeddings.weight.shared = True
> 
>         # Ensure that first and last stages have the same initial parameter
>         # values.
>         if torch.distributed.is_initialized():
>             if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
>                 torch.distributed.all_reduce(self.word_embeddings_weight().data,
>                                              group=mpu.get_embedding_group())
>         else:
>             print("WARNING! Distributed processes aren't initialized, so "
>                   "word embeddings in the last layer are not initialized. "
>                   "If you are just manipulating a model this is fine, but "
>                   "this needs to be handled manually. If you are training "
>                   "something is definitely wrong.")
211,251d112
<     def get_shared_tensor(self, grads=False):
<         args = get_args()
<         tensor_dict = {}
<         for key in sorted(self.shared_weights_info):
<             if key == "word_embeddings":
<                 if grads:
<                     if args.DDP_impl == 'local':
<                         tensor_dict["word_embeddings"] = self.word_embeddings.weight.main_grad
<                     else:
<                         tensor_dict["word_embeddings"] = self.word_embeddings.weight.grad
<                 else:
<                     tensor_dict["word_embeddings"] = self.word_embeddings.weight.data
<             elif key == "position_embeddings":
<                 if grads:
<                     if args.DDP_impl == 'local':
<                         tensor_dict["position_embeddings"] = self.position_embeddings.weight.main_grad
<                     else:
<                         tensor_dict["position_embeddings"] = self.position_embeddings.weight.grad
<                 else:
<                     tensor_dict["position_embeddings"] = self.position_embeddings.weight.data          
<         return tensor_dict
< 
<     def set_shared_tensor(self, new_data, grads=False):
<         args = get_args()
<         for key in sorted(self.shared_weights_info):
<             if key == "word_embeddings":
<                 if grads:
<                     if args.DDP_impl == 'local':
<                         self.word_embeddings.weight.main_grad = new_data["word_embeddings"][0]
<                     else:
<                         self.word_embeddings.weight.grad = new_data["word_embeddings"][0]
<                 else:
<                     self.word_embeddings.weight.data = new_data["word_embeddings"][0]
<             elif key == "position_embeddings":
<                 if grads:
<                     if args.DDP_impl == 'local':
<                         self.position_embeddings.weight.main_grad = new_data["position_embeddings"][0]
<                     else:
<                         self.position_embeddings.grad = new_data["position_embeddings"][0]
<                 else:
<                     self.position_embeddings.weight.data = new_data["position_embeddings"][0]
Only in ../Megatron-LM-base/megatron/model: multiple_choice.py
Only in ../Megatron-LM-base/megatron/model: realm_model.py
Only in ../Megatron-LM-base/megatron/model: t5_model.py
Only in ../Megatron-LM-base/megatron/model: transformer.py
Only in ../Megatron-LM-base/megatron/model: vit_model.py
diff --color -r runtime/megatron/mpu/cross_entropy.py ../Megatron-LM-base/megatron/mpu/cross_entropy.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/mpu/cross_entropy.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
29c21
< from .initialize import get_tensor_model_parallel_world_size, get_group, get_resharding_group, get_op_tp_size
---
> from .initialize import get_tensor_model_parallel_world_size
37c29
<         # print(f"[DEBUG] vocab_parallel_logits: {list(vocab_parallel_logits.size())}")
---
> 
66,67d57
<         # print(f"[DEBUG] arange_1d = {list(arange_1d.size())}")
<         # print(f"[DEBUG] masked_target_1d = {list(masked_target_1d.size())}")                                 
121,209d110
< 
< 
< class _NewVocabParallelCrossEntropy(torch.autograd.Function):
< 
<     @staticmethod
<     def forward(ctx, vocab_parallel_logits, target):
<         
<         # Maximum value along vocab dimension across all GPUs.
<         logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
<         torch.distributed.all_reduce(logits_max,
<                                      op=torch.distributed.ReduceOp.MAX,
<                                      group=get_group(get_resharding_group()))
<         # Subtract the maximum value.
<         vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
< 
<         # Get the partition's vocab indecies
<         get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
<         partition_vocab_size = vocab_parallel_logits.size()[-1]
< 
<         rank = torch.distributed.get_rank(group=get_group(get_resharding_group()))
<         world_size = len(get_resharding_group())
<         vocab_start_index, vocab_end_index = get_vocab_range(
<             partition_vocab_size, rank, world_size)
< 
<         # Create a mask of valid vocab ids (1 means it needs to be masked).
<         target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
<         masked_target = target.clone() - vocab_start_index
<         masked_target[target_mask] = 0
< 
<         # Get predicted-logits = logits[target].
<         # For Simplicity, we convert logits to a 2-D tensor with size
<         # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
<         logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
<         masked_target_1d = masked_target.view(-1)
<         arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
<                                  device=logits_2d.device)
< 
<         predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
<         predicted_logits_1d = predicted_logits_1d.clone().contiguous()
<         predicted_logits = predicted_logits_1d.view_as(target)
<         predicted_logits[target_mask] = 0.0
<         # All reduce is needed to get the chunks from other GPUs.
<         torch.distributed.all_reduce(predicted_logits,
<                                      op=torch.distributed.ReduceOp.SUM,
<                                      group=get_group(get_resharding_group()))
< 
<         # Sum of exponential of logits along vocab dimension across all GPUs.
<         exp_logits = vocab_parallel_logits
<         torch.exp(vocab_parallel_logits, out=exp_logits)
<         sum_exp_logits = exp_logits.sum(dim=-1)
<         torch.distributed.all_reduce(sum_exp_logits,
<                                      op=torch.distributed.ReduceOp.SUM,
<                                      group=get_group(get_resharding_group()))
< 
<         # Loss = log(sum(exp(logits))) - predicted-logit.
<         loss = torch.log(sum_exp_logits) - predicted_logits
< 
<         # Store softmax, target-mask and masked-target for backward pass.
<         exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
<         ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
< 
<         return loss
< 
<     @staticmethod
<     def backward(ctx, grad_output):
< 
<         # Retreive tensors from the forward path.
<         softmax, target_mask, masked_target_1d = ctx.saved_tensors
< 
<         # All the inputs have softmax as thier gradient.
<         grad_input = softmax
<         # For simplicity, work with the 2D gradient.
<         partition_vocab_size = softmax.size()[-1]
<         grad_2d = grad_input.view(-1, partition_vocab_size)
< 
<         # Add the gradient from matching classes.
<         arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
<                                  device=grad_2d.device)
<         grad_2d[arange_1d, masked_target_1d] -= (
<             1.0 - target_mask.view(-1).float())
< 
<         # Finally elementwise multiplication with the output gradients.
<         grad_input.mul_(grad_output.unsqueeze(dim=-1))
< 
<         return grad_input, None
< 
< def new_vocab_parallel_cross_entropy(vocab_parallel_logits, target):
<     """Helper function for the cross entropy."""
<     return _NewVocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
diff --color -r runtime/megatron/mpu/data.py ../Megatron-LM-base/megatron/mpu/data.py
21c21
< from megatron import get_timers
---
> 
86,87d85
<     timers = get_timers()
< 
91c89
<                                                                           data)                                                                      
---
>                                                                           data)
99c97
<             [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()       
---
>             [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
103c101
<                                    dtype=datatype)                             
---
>                                    dtype=datatype)
107c105
<                                 group=get_tensor_model_parallel_group())                          
---
>                                 group=get_tensor_model_parallel_group())
116a115
> 
diff --color -r runtime/megatron/mpu/initialize.py ../Megatron-LM-base/megatron/mpu/initialize.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/mpu/initialize.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
30c22
< from megatron import get_args
---
> 
37a30,31
> # Embedding group.
> _EMBEDDING_GROUP = None
50,81c44,46
< # For FlexPipe
< _NUM_OPS_IN_EACH_STAGE_LIST =None
< _OPS_START_INDEX_LIST = None
< _OPS_END_INDEX_LIST = None
< 
< _CHILD_RANKS = None
< _PARENT_RANKS = None
< 
< _FLEXPIPE_PREV_RANKS = None
< _FLEXPIPE_NEXT_RANKS = None
< 
< _VIRTUAL_PIPELINE_NEXT_FORWARD_MODEL_PARALLEL_RANK = None
< _VIRTUAL_PIPELINE_NEXT_BACKWARD_MODEL_PARALLEL_RANK = None
< _VIRTUAL_PIPELINE_BACKWARD_MODEL_PARALLEL_RANK = None
< 
< _RANKS_IN_EACH_PIPELINE_STAGE = None
< 
< _BWD_SEND_INFO = None
< _FWD_RECV_INFO = None
< _FWD_SEND_INFO = None
< _BWD_RECV_INFO = None
< 
< all_groups = {}
< _TP_SIZE_PER_OP = None
< _DP_SIZE_PER_OP = None
< _RESHARDING_GROUP = None
< _RESHARDING_RANK = None
< _RESHARDING_DIM = None
< _OP_RESHARDING_RANKS = []
< 
< _TENSOR_MODEL_PARALLEL_RANKS = None
< _DATA_PARALLEL_RANKS = None
---
> # A list of global ranks for each pipeline group to ease calculation of the source
> # rank when broadcasting from the first or last pipeline stage
> _PIPELINE_GLOBAL_RANKS = None
87,92c52,55
< def initialize_model_parallel_flexpipe():
<     """
<     Initialize model data parallel groups for FlexPipe.
<     Generate _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP, _TENSOR_MODEL_PARALLEL_GROUP, _PIPELINE_MODEL_PARALLEL_GROUP in this function.
<     Because FlexPipe supports different tensor model parallelism size at each pipeline stage,
<     this function is quite different from original Megatron.
---
> 
> def initialize_model_parallel(tensor_model_parallel_size_=1,
>                               pipeline_model_parallel_size_=1,
>                               virtual_pipeline_model_parallel_size_=None):
94,109c57
<     args = get_args()
<     num_ops_in_each_stage = args.num_ops_in_each_stage
<     virtual_pipeline_model_parallel_size_ = args.virtual_pipeline_model_parallel_size
< 
<     global _TP_SIZE_PER_OP, _DP_SIZE_PER_OP
<     _TP_SIZE_PER_OP = []
<     for i in range(len(args.model_parallel_size_of_each_op)):
<         _TP_SIZE_PER_OP += args.model_parallel_size_of_each_op[i]
<     _DP_SIZE_PER_OP = [] 
<     for i in range(len(args.data_parallel_size_of_each_op)):
<         _DP_SIZE_PER_OP += args.data_parallel_size_of_each_op[i]
< 
<     input_mp_size_of_each_stage = []
<     output_mp_size_of_each_stage = []
<     input_dp_size_of_each_stage = []
<     output_dp_size_of_each_stage = []    
---
>     Initialize model data parallel groups.
110a59,78
>     Arguments:
>         tensor_model_parallel_size: number of GPUs used to parallelize model tensor.
>         pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline.
> 
>     Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
>     use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
>     the model pipeline. The present function will
>     create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
>     and 8 data-parallel groups as:
>         8 data_parallel groups:
>             [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
>         8 tensor model-parallel groups:
>             [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
>         4 pipeline model-parallel groups:
>             [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
>     Note that for efficiency, the caller should make sure adjacent ranks
>     are on the same DGX box. For example if we are using 2 DGX-1 boxes
>     with a total of 16 GPUs, rank 0 to 7 belong to the first box and
>     ranks 8 to 15 belong to the second box.
>     """
112,113c80,84
<         print('> initializing FlexPipe...')
< 
---
>         print('> initializing tensor model parallel with size {}'.format(
>             tensor_model_parallel_size_))
>         print('> initializing pipeline model parallel with size {}'.format(
>             pipeline_model_parallel_size_))
>     # Get world size and rank. Ensure some consistencies.
116c87,102
<     rank = torch.distributed.get_rank()
---
>     tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size)
>     pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size)
>     ensure_divisibility(world_size,
>                         tensor_model_parallel_size * pipeline_model_parallel_size)
>     data_parallel_size = world_size // (tensor_model_parallel_size *
>                                         pipeline_model_parallel_size)
> 
>     num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
>     num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
>     num_data_parallel_groups = world_size // data_parallel_size
> 
>     if virtual_pipeline_model_parallel_size_ is not None:
>         global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
>         global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
>         _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
>         _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_
118,140c104
<     global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
<     global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
<     _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
<     _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_    
< 
<     global _NUM_OPS_IN_EACH_STAGE_LIST
<     _NUM_OPS_IN_EACH_STAGE_LIST = list(map(int, num_ops_in_each_stage))
< 
<     global _OPS_START_INDEX_LIST
<     global _OPS_END_INDEX_LIST
<     start_index = 0
<     start_index_list = []
<     end_index_list = []
<     for i in range(len(_NUM_OPS_IN_EACH_STAGE_LIST)):
<         start_index_list.append(start_index)
<         start_index += _NUM_OPS_IN_EACH_STAGE_LIST[i]
<         end_index_list.append(start_index)
<     _OPS_START_INDEX_LIST = start_index_list
<     _OPS_END_INDEX_LIST = end_index_list
< 
<     global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
<     pipeline_model_parallel_size = len(_NUM_OPS_IN_EACH_STAGE_LIST)
<     _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = pipeline_model_parallel_size
---
>     rank = torch.distributed.get_rank()
142c106,107
<     global _DATA_PARALLEL_GROUP, _DATA_PARALLEL_RANKS
---
>     # Build the data-parallel groups.
>     global _DATA_PARALLEL_GROUP
144,147c109,110
<         'data parallel group is already initialized'    
< 
<     _DATA_PARALLEL_GROUP = []
<     _DATA_PARALLEL_RANKS = []
---
>         'data parallel group is already initialized'
>     all_data_parallel_group_ranks = []
149,163c112,131
<         start_rank = 0
<         for ii in range(0, i):
<             STAGE_TP_SIZE = _TP_SIZE_PER_OP[_OPS_START_INDEX_LIST[ii]]
<             STAGE_DP_SIZE = _DP_SIZE_PER_OP[_OPS_START_INDEX_LIST[ii]]
<             start_rank += STAGE_TP_SIZE * STAGE_DP_SIZE
<         end_rank = start_rank + _TP_SIZE_PER_OP[_OPS_START_INDEX_LIST[i]] * _DP_SIZE_PER_OP[_OPS_START_INDEX_LIST[i]]
<         for op_index in range(_OPS_START_INDEX_LIST[i], _OPS_END_INDEX_LIST[i]):
<             OP_TP_SIZE = _TP_SIZE_PER_OP[op_index]
<             OP_DP_SIZE = _DP_SIZE_PER_OP[op_index]
<             for j in range(OP_TP_SIZE):
<                 ranks = range(start_rank + j, end_rank, OP_TP_SIZE)
<                 group = get_group(ranks)
<                 if rank in ranks:
<                     _DATA_PARALLEL_GROUP.append(group)
<                     _DATA_PARALLEL_RANKS.append(ranks)
---
>         start_rank = i * num_pipeline_model_parallel_groups
>         end_rank = (i + 1) * num_pipeline_model_parallel_groups
>         for j in range(tensor_model_parallel_size):
>             ranks = range(start_rank + j, end_rank,
>                           tensor_model_parallel_size)
>             all_data_parallel_group_ranks.append(list(ranks))
>             group = torch.distributed.new_group(ranks)
>             if rank in ranks:
>                 _DATA_PARALLEL_GROUP = group
> 
>     # Build the model-parallel groups.
>     global _MODEL_PARALLEL_GROUP
>     assert _MODEL_PARALLEL_GROUP is None, \
>         'model parallel group is already initialized'
>     for i in range(data_parallel_size):
>         ranks = [data_parallel_group_ranks[i]
>                  for data_parallel_group_ranks in all_data_parallel_group_ranks]
>         group = torch.distributed.new_group(ranks)
>         if rank in ranks:
>             _MODEL_PARALLEL_GROUP = group
166c134
<     global _TENSOR_MODEL_PARALLEL_GROUP, _TENSOR_MODEL_PARALLEL_RANKS
---
>     global _TENSOR_MODEL_PARALLEL_GROUP
169,194c137,140
<     _TENSOR_MODEL_PARALLEL_GROUP = []
<     _TENSOR_MODEL_PARALLEL_RANKS = []
<     for i in range(pipeline_model_parallel_size):
<         start_rank = 0
<         for ii in range(i):
<             STAGE_TP_SIZE = _TP_SIZE_PER_OP[_OPS_START_INDEX_LIST[ii]]
<             STAGE_DP_SIZE = _DP_SIZE_PER_OP[_OPS_START_INDEX_LIST[ii]]
<             start_rank += STAGE_TP_SIZE * STAGE_DP_SIZE
<         for op_index in range(_OPS_START_INDEX_LIST[i], _OPS_END_INDEX_LIST[i]):
<             OP_TP_SIZE = _TP_SIZE_PER_OP[op_index]
<             OP_DP_SIZE = _DP_SIZE_PER_OP[op_index]
<             for j in range(OP_DP_SIZE):
<                 ranks = range(start_rank + j * OP_TP_SIZE, start_rank + (j+1) * OP_TP_SIZE)
<                 group = get_group(ranks)
<                 if rank in ranks:
<                     _TENSOR_MODEL_PARALLEL_GROUP.append(group)
<                     _TENSOR_MODEL_PARALLEL_RANKS.append(ranks)
< 
<     global _MPU_PIPELINE_MODEL_PARALLEL_RANK
<     ranks_in_each_pipe_stage = []
<     start_rank = 0
<     for i in range(pipeline_model_parallel_size):
<         STAGE_TP_SIZE = _TP_SIZE_PER_OP[_OPS_START_INDEX_LIST[i]]
<         STAGE_DP_SIZE = _DP_SIZE_PER_OP[_OPS_START_INDEX_LIST[i]]
<         end_rank = start_rank + STAGE_TP_SIZE * STAGE_DP_SIZE  
<         ranks = [j for j in range(start_rank, end_rank)]
---
>     for i in range(num_tensor_model_parallel_groups):
>         ranks = range(i * tensor_model_parallel_size,
>                       (i + 1) * tensor_model_parallel_size)
>         group = torch.distributed.new_group(ranks)
196,202c142
<             _MPU_PIPELINE_MODEL_PARALLEL_RANK = i
<         ranks_in_each_pipe_stage.append(ranks)
<         start_rank = end_rank
< 
<     # store child ranks and parent ranks for each rank
<     child_ranks = [[] for _ in range(world_size)]
<     parent_ranks = [[] for _ in range(world_size)]
---
>             _TENSOR_MODEL_PARALLEL_GROUP = group
204,207c144,163
<     stage_start_rank = 0
<     for i in range(pipeline_model_parallel_size):
<         if i != (pipeline_model_parallel_size -1):
<             next_i = i + 1
---
>     # Build the pipeline model-parallel groups and embedding groups
>     # (first and last rank in each pipeline model-parallel group).
>     global _PIPELINE_MODEL_PARALLEL_GROUP
>     global _PIPELINE_GLOBAL_RANKS
>     assert _PIPELINE_MODEL_PARALLEL_GROUP is None, \
>         'pipeline model parallel group is already initialized'
>     global _EMBEDDING_GROUP
>     assert _EMBEDDING_GROUP is None, \
>         'embedding group is already initialized'
>     for i in range(num_pipeline_model_parallel_groups):
>         ranks = range(i, world_size,
>                       num_pipeline_model_parallel_groups)
>         group = torch.distributed.new_group(ranks)
>         if rank in ranks:
>             _PIPELINE_MODEL_PARALLEL_GROUP = group
>             _PIPELINE_GLOBAL_RANKS = ranks
>         # Setup embedding group (to exchange gradients between
>         # first and last stages).
>         if len(ranks) > 1:
>             embedding_ranks = [ranks[0], ranks[-1]]
209,250c165,168
<             next_i = 0    
<         tp_size = _TP_SIZE_PER_OP[_OPS_END_INDEX_LIST[i]-1]
<         dp_size = _DP_SIZE_PER_OP[_OPS_END_INDEX_LIST[i]-1]
<         tp_size_next = _TP_SIZE_PER_OP[_OPS_START_INDEX_LIST[next_i]]
<         dp_size_next = _DP_SIZE_PER_OP[_OPS_START_INDEX_LIST[next_i]]
< 
<         for j in range(len(ranks_in_each_pipe_stage[i])):
<             current_rank = ranks_in_each_pipe_stage[i][j]
<             dp_id = j // tp_size
<             tp_id = j % tp_size
< 
<             next_dp_id = [dp_id]
<             next_tp_id = [tp_id]
< 
<             if tp_size_next > tp_size:
<                 ensure_divisibility(tp_size_next, tp_size)
<                 ratio = tp_size_next // tp_size
<                 next_tp_id = range(tp_id * ratio, (tp_id + 1)*ratio)
<             if tp_size_next < tp_size:
<                 ensure_divisibility(tp_size, tp_size_next)
<                 ratio = tp_size // tp_size_next
<                 next_tp_id = [tp_id // ratio]          
<             if dp_size_next > dp_size:
<                 ensure_divisibility(dp_size_next, dp_size)
<                 ratio = dp_size_next // dp_size
<                 next_dp_id = range(dp_id * ratio, (dp_id + 1)*ratio)
<             if dp_size_next < dp_size:
<                 ensure_divisibility(dp_size, dp_size_next)
<                 ratio = dp_size // dp_size_next
<                 next_dp_id = [dp_id // ratio]           
< 
<             child_rank_list = []
<             if next_i != 0:
<                 next_stage_start_index = stage_start_rank + len(ranks_in_each_pipe_stage[i])
<             else:
<                 next_stage_start_index = 0
<             for _dp_id in next_dp_id:
<                 for _tp_id in next_tp_id:
<                     child_rank_list.append(next_stage_start_index + _dp_id * tp_size_next + _tp_id)
<             child_ranks[current_rank] = child_rank_list
<         
<         stage_start_rank += len(ranks_in_each_pipe_stage[i])
---
>             embedding_ranks = ranks
>         group = torch.distributed.new_group(embedding_ranks)
>         if rank in embedding_ranks:
>             _EMBEDDING_GROUP = group
252,295d169
<     for i in range(pipeline_model_parallel_size):
<         for j in range(len(ranks_in_each_pipe_stage[i])):
<             current_rank = ranks_in_each_pipe_stage[i][j]
<             for child_rank in child_ranks[current_rank]:
<                 parent_ranks[child_rank].append(current_rank)
< 
<     global _CHILD_RANKS
<     global _PARENT_RANKS
< 
<     _CHILD_RANKS = child_ranks
<     _PARENT_RANKS = parent_ranks
< 
<     global _FLEXPIPE_PREV_RANKS
<     global _FLEXPIPE_NEXT_RANKS
< 
<     _FLEXPIPE_PREV_RANKS = parent_ranks[rank]
<     _FLEXPIPE_NEXT_RANKS = child_ranks[rank]
< 
<     global _RANKS_IN_EACH_PIPELINE_STAGE
<     _RANKS_IN_EACH_PIPELINE_STAGE = ranks_in_each_pipe_stage
< 
<     global _OP_RESHARDING_RANKS
<     _OP_RESHARDING_RANKS = [None for _ in range(sum(_NUM_OPS_IN_EACH_STAGE_LIST))]
< 
<     ## fix: workaround for the group issue:
<     if world_size >= 2:
<         for i in range(0, world_size, 2):
<             ranks = range(i, i+2)
<             get_group(ranks)
< 
<     if world_size >= 4:
<         for i in range(0, world_size, 4):
<             ranks = range(i, i+4)
<             get_group(ranks)    
< 
<     print(f'[DEBUG]|rank {torch.distributed.get_rank()}| \
<     pipeline_rank= {get_pipeline_model_parallel_rank()} | \
<     tp_size= {get_tensor_model_parallel_world_size()} | \
<     tp_rank={get_tensor_model_parallel_rank()} | \
<     tp_src_rank={get_tensor_model_parallel_src_rank()} | \
<     dp_size= {get_data_parallel_world_size()} | \
<     parent ranks={get_stage_comm_recv_ranks()} | \
<     child ranks = {get_stage_comm_send_ranks()} | \
<     args.micro_batch_size = {args.micro_batch_size}\n')
304a179
> 
307,310c182,185
<     return None
<     # assert _MODEL_PARALLEL_GROUP is not None, \
<     #     'model parallel group is not initialized'
<     # return _MODEL_PARALLEL_GROUP
---
>     assert _MODEL_PARALLEL_GROUP is not None, \
>         'model parallel group is not initialized'
>     return _MODEL_PARALLEL_GROUP
> 
312c187
< def get_tensor_model_parallel_group(op_index=None):
---
> def get_tensor_model_parallel_group():
316,319c191,192
<     args = get_args()
<     if op_index is None:
<         op_index = _OPS_START_INDEX_LIST[get_pipeline_model_parallel_rank()]
<     return get_tensor_model_parallel_group_via_op_index(op_index)
---
>     return _TENSOR_MODEL_PARALLEL_GROUP
> 
323,326c196,199
<     # assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, \
<     #     'pipeline_model parallel group is not initialized'
<     # return _PIPELINE_MODEL_PARALLEL_GROUP
<     return None
---
>     assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, \
>         'pipeline_model parallel group is not initialized'
>     return _PIPELINE_MODEL_PARALLEL_GROUP
> 
328c201
< def get_data_parallel_group(op_index=None):
---
> def get_data_parallel_group():
332,335c205,213
<     args = get_args()
<     if op_index is None:
<         op_index = _OPS_START_INDEX_LIST[get_pipeline_model_parallel_rank()]
<     return get_data_parallel_group_via_op_index(op_index)    
---
>     return _DATA_PARALLEL_GROUP
> 
> 
> def get_embedding_group():
>     """Get the embedding group the caller rank belongs to."""
>     assert _EMBEDDING_GROUP is not None, \
>         'embedding group is not initialized'
>     return _EMBEDDING_GROUP
> 
341a220
> 
346a226
> 
353a234
> 
357,358c238,241
<     assert _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None
<     return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
---
>     if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
>         return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
>     return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
> 
364a248
> 
369a254
> 
376a262
> 
380,381c266,269
<     assert _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None
<     return _MPU_PIPELINE_MODEL_PARALLEL_RANK
---
>     if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
>         return _MPU_PIPELINE_MODEL_PARALLEL_RANK
>     return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
> 
390a279
> 
402a292
> 
407a298
> 
412a304
> 
417a310
> 
422c315,345
<     return global_rank - get_tensor_model_parallel_rank()
---
>     local_world_size = get_tensor_model_parallel_world_size()
>     return (global_rank // local_world_size) * local_world_size
> 
> 
> def get_pipeline_model_parallel_first_rank():
>     assert _PIPELINE_GLOBAL_RANKS is not None, \
>         "Pipeline parallel group is not initialized"
>     return _PIPELINE_GLOBAL_RANKS[0]
> 
> 
> def get_pipeline_model_parallel_last_rank():
>     assert _PIPELINE_GLOBAL_RANKS is not None, \
>         "Pipeline parallel group is not initialized"
>     last_rank_local = get_pipeline_model_parallel_world_size() - 1
>     return _PIPELINE_GLOBAL_RANKS[last_rank_local]
> 
> def get_pipeline_model_parallel_next_rank():
>     assert _PIPELINE_GLOBAL_RANKS is not None, \
>         "Pipeline parallel group is not initialized"
>     rank_in_pipeline = get_pipeline_model_parallel_rank()
>     world_size = get_pipeline_model_parallel_world_size()
>     return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
> 
> 
> def get_pipeline_model_parallel_prev_rank():
>     assert _PIPELINE_GLOBAL_RANKS is not None, \
>         "Pipeline parallel group is not initialized"
>     rank_in_pipeline = get_pipeline_model_parallel_rank()
>     world_size = get_pipeline_model_parallel_world_size()
>     return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
> 
427a351
> 
431a356
> 
440,629d364
< 
< def get_stage_comm_recv_ranks():
<     assert _FLEXPIPE_PREV_RANKS is not None, \
<         "_FLEXPIPE_PREV_RANKS is not initialized"    
<     return _FLEXPIPE_PREV_RANKS
< 
< def get_stage_comm_send_ranks():
<     assert _FLEXPIPE_NEXT_RANKS is not None, \
<         "_FLEXPIPE_NEXT_RANKS is not initialized"    
<     return _FLEXPIPE_NEXT_RANKS
< 
< def get_op_start_index(rank_in_pipeline, model_chunk_id=0):
<     assert _OPS_START_INDEX_LIST is not None, \
<         "_OPS_START_INDEX_LIST is not initialized"    
<     num_pipeline_stages = len(_NUM_OPS_IN_EACH_STAGE_LIST)
<     return _OPS_START_INDEX_LIST[rank_in_pipeline + model_chunk_id * num_pipeline_stages]    
< 
< def get_op_end_index(rank_in_pipeline, model_chunk_id=0):
<     assert _OPS_END_INDEX_LIST is not None, \
<         "_OPS_END_INDEX_LIST is not initialized"    
<     num_pipeline_stages = len(_NUM_OPS_IN_EACH_STAGE_LIST)     
<     return _OPS_END_INDEX_LIST[rank_in_pipeline + model_chunk_id * num_pipeline_stages]    
< 
< def get_num_ops_list():
<     assert _NUM_OPS_IN_EACH_STAGE_LIST is not None, \
<         "_NUM_OPS_IN_EACH_STAGE_LIST is not initialized"
<     return _NUM_OPS_IN_EACH_STAGE_LIST
< 
< def set_virtual_pipeline_next_forward_model_rank(model_chunk_id):
<     global _VIRTUAL_PIPELINE_NEXT_FORWARD_MODEL_PARALLEL_RANK
<     _VIRTUAL_PIPELINE_NEXT_FORWARD_MODEL_PARALLEL_RANK = model_chunk_id
< 
< def set_virtual_pipeline_next_backward_model_rank(model_chunk_id):
<     global _VIRTUAL_PIPELINE_NEXT_BACKWARD_MODEL_PARALLEL_RANK
<     _VIRTUAL_PIPELINE_NEXT_BACKWARD_MODEL_PARALLEL_RANK = model_chunk_id
< 
< def get_virtual_pipeline_next_forward_model_rank():
<     if _VIRTUAL_PIPELINE_NEXT_FORWARD_MODEL_PARALLEL_RANK is not None:
<         return _VIRTUAL_PIPELINE_NEXT_FORWARD_MODEL_PARALLEL_RANK
<     else:
<         return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
< 
< def get_virtual_pipeline_next_backward_model_rank():
<     if _VIRTUAL_PIPELINE_NEXT_BACKWARD_MODEL_PARALLEL_RANK is not None:
<         return _VIRTUAL_PIPELINE_NEXT_BACKWARD_MODEL_PARALLEL_RANK
<     else:
<         return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
< 
< def set_virtual_pipeline_backward_model_parallel_rank(model_chunk_id):
<     global _VIRTUAL_PIPELINE_BACKWARD_MODEL_PARALLEL_RANK
<     _VIRTUAL_PIPELINE_BACKWARD_MODEL_PARALLEL_RANK = model_chunk_id
< 
< def get_virtual_pipeline_backward_model_parallel_rank():
<     if _VIRTUAL_PIPELINE_BACKWARD_MODEL_PARALLEL_RANK is not None:
<         return _VIRTUAL_PIPELINE_BACKWARD_MODEL_PARALLEL_RANK
<     else:
<         return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
< 
< def get_pipeline_rank_via_op_index(op_index):
<     global _NUM_OPS_IN_EACH_STAGE_LIST
<     sum = 0
<     for i in range(len(_NUM_OPS_IN_EACH_STAGE_LIST)):
<         sum += _NUM_OPS_IN_EACH_STAGE_LIST[i]
<         if sum > op_index:
<             return  i % len(_NUM_OPS_IN_EACH_STAGE_LIST)
< 
< def get_ranks_via_pipeline_stage(pipeline_stage):
<     return _RANKS_IN_EACH_PIPELINE_STAGE[pipeline_stage]
< 
< def get_next_pipeline_model_parallel_rank():
<     """Return my rank for the pipeline model parallel group."""
<     if is_pipeline_last_stage():
<         return 0
<     else:
<         return get_pipeline_model_parallel_rank() + 1
< 
< def get_prev_pipeline_model_parallel_rank():
<     """Return my rank for the pipeline model parallel group."""
<     if is_pipeline_first_stage():
<         return get_pipeline_model_parallel_world_size() - 1
<     else:
<         return get_pipeline_model_parallel_rank() - 1
< 
< def set_comm_info(bwd_send_info, fwd_recv_info, fwd_send_info, bwd_recv_info):
<     global _BWD_SEND_INFO, _FWD_RECV_INFO, _FWD_SEND_INFO, _BWD_RECV_INFO
<     _BWD_SEND_INFO = bwd_send_info
<     _FWD_RECV_INFO = fwd_recv_info
<     _FWD_SEND_INFO = fwd_send_info
<     _BWD_RECV_INFO = bwd_recv_info
< 
< def get_recv_info(forward):
<     global _FWD_RECV_INFO, _BWD_RECV_INFO
<     if forward:
<         return _FWD_RECV_INFO
<     else:
<         return _BWD_RECV_INFO
< 
< def get_send_info(forward):
<     global _FWD_SEND_INFO, _BWD_SEND_INFO
<     if forward:
<         return _FWD_SEND_INFO
<     else:
<         return _BWD_SEND_INFO
< 
< def bitmap(ranks):
<     """
<     (from Zhiqi's codebase)
<     map the rank list to the bit map string
<     """
<     bits = '0' * torch.distributed.get_world_size()
<     for rank in ranks:
<         if rank >= len(bits):
<             raise ValueError("rank {} out of range ({})".format(rank, len(bits)))
<         bits = bits[0:rank] + '1' + bits[rank+1:]
<     return bits
< 
< def get_group(ranks):
<     group_bits = bitmap(ranks)
<     if group_bits not in all_groups: 
<         all_groups[group_bits] = torch.distributed.new_group(list(ranks))       
< 
<     return all_groups[group_bits] 
< 
< def get_op_tp_size(op_index):
<     return _TP_SIZE_PER_OP[op_index]
< 
< def get_op_dp_size(op_index):
<     assert op_index < len(_DP_SIZE_PER_OP), f"op index {op_index} out of range({len(_DP_SIZE_PER_OP)})."
<     return _DP_SIZE_PER_OP[op_index]
< 
< def set_resharding_group(devices):
<     global _RESHARDING_GROUP
<     _RESHARDING_GROUP = devices 
< 
< def get_resharding_group():
<     global _RESHARDING_GROUP
<     assert _RESHARDING_GROUP is not None
<     return _RESHARDING_GROUP
< 
< def set_resharding_rank(rank):
<     global _RESHARDING_RANK
<     _RESHARDING_RANK = rank 
< 
< def get_resharding_rank():
<     global _RESHARDING_RANK
<     assert _RESHARDING_RANK is not None
<     return _RESHARDING_RANK
< 
< def set_resharding_dim(dim):
<     global _RESHARDING_DIM
<     _RESHARDING_DIM = dim 
< 
< def get_resharding_dim():
<     global _RESHARDING_DIM
<     assert _RESHARDING_DIM is not None
<     return _RESHARDING_DIM
< 
< def get_data_parallel_group_via_op_index(op_index):
<     assert _DATA_PARALLEL_GROUP is not None, \
<         'data parallel group is not initialized'
<     pp_stage = get_pipeline_model_parallel_rank()
<     start_op_index = _OPS_START_INDEX_LIST[pp_stage]
<     return _DATA_PARALLEL_GROUP[op_index - start_op_index]
< 
< def get_tensor_model_parallel_group_via_op_index(op_index):
<     assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \
<         'tensor model parallel group is not initialized'
<     pp_stage = get_pipeline_model_parallel_rank()
<     start_op_index = _OPS_START_INDEX_LIST[pp_stage]    
<     return _TENSOR_MODEL_PARALLEL_GROUP[op_index - start_op_index]
< 
< def set_op_resharding_ranks(op_index, ranks):
<     _OP_RESHARDING_RANKS[op_index] = ranks 
< 
< def get_op_resharding_ranks(op_index):
<     return _OP_RESHARDING_RANKS[op_index]
< 
< def get_tensor_model_parallel_ranks_via_op_index(op_index):
<     assert _TENSOR_MODEL_PARALLEL_RANKS is not None, \
<         'tensor model parallel group is not initialized'
<     pp_stage = get_pipeline_model_parallel_rank()
<     start_op_index = _OPS_START_INDEX_LIST[pp_stage]    
<     return _TENSOR_MODEL_PARALLEL_RANKS[op_index - start_op_index]    
< 
< def get_data_parallel_ranks_via_op_index(op_index):
<     assert _DATA_PARALLEL_RANKS is not None, \
<         'tensor model parallel group is not initialized'
<     pp_stage = get_pipeline_model_parallel_rank()
<     start_op_index = _OPS_START_INDEX_LIST[pp_stage]    
<     return _DATA_PARALLEL_RANKS[op_index - start_op_index]        
diff --color -r runtime/megatron/mpu/__init__.py ../Megatron-LM-base/megatron/mpu/__init__.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/mpu/__init__.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
34a27
> from .initialize import get_embedding_group
41a35,38
> from .initialize import get_pipeline_model_parallel_first_rank
> from .initialize import get_pipeline_model_parallel_last_rank
> from .initialize import get_pipeline_model_parallel_next_rank
> from .initialize import get_pipeline_model_parallel_prev_rank
44a42
> from .initialize import initialize_model_parallel
53,61c51
< 
< from .layers import NewColumnParallelLinear     
< from .layers import NewRowParallelLinear    
< from .layers import NewVocabParallelEmbedding                  
< from .cross_entropy import new_vocab_parallel_cross_entropy
< from .mappings import _PrimReplicate
< from .mappings import _PrimAllGather
< 
< 
---
>                      
77,113d66
< 
< from .initialize import initialize_model_parallel_flexpipe
< from .initialize import get_stage_comm_recv_ranks
< from .initialize import get_stage_comm_send_ranks
< from .initialize import get_op_start_index
< from .initialize import get_op_end_index
< from .initialize import get_num_ops_list
< 
< from .initialize import set_virtual_pipeline_next_backward_model_rank
< from .initialize import set_virtual_pipeline_next_forward_model_rank
< from .initialize import get_virtual_pipeline_next_backward_model_rank
< from .initialize import get_virtual_pipeline_next_forward_model_rank
< from .initialize import get_virtual_pipeline_model_parallel_world_size
< from .initialize import set_virtual_pipeline_backward_model_parallel_rank
< from .initialize import get_virtual_pipeline_backward_model_parallel_rank
< from .initialize import get_pipeline_rank_via_op_index
< from .initialize import get_ranks_via_pipeline_stage
< from .initialize import get_next_pipeline_model_parallel_rank
< from .initialize import get_prev_pipeline_model_parallel_rank
< from .initialize import set_comm_info
< from .initialize import get_recv_info
< from .initialize import get_send_info
< from .initialize import get_group
< from .initialize import get_op_dp_size
< from .initialize import get_op_tp_size
< from .initialize import set_resharding_group
< from .initialize import get_resharding_group
< from .initialize import set_resharding_rank
< from .initialize import get_resharding_rank
< from .initialize import set_resharding_dim
< from .initialize import get_resharding_dim
< from .initialize import get_data_parallel_group_via_op_index
< from .initialize import get_tensor_model_parallel_group_via_op_index
< from .initialize import set_op_resharding_ranks
< from .initialize import get_op_resharding_ranks
< from .initialize import get_tensor_model_parallel_ranks_via_op_index
< from .initialize import get_data_parallel_ranks_via_op_index
\ No newline at end of file
diff --color -r runtime/megatron/mpu/layers.py ../Megatron-LM-base/megatron/mpu/layers.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/mpu/layers.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
37c29
< from .initialize import get_tensor_model_parallel_world_size, get_group
---
> from .initialize import get_tensor_model_parallel_world_size
47c39
< from megatron import get_timers
---
> 
52a45
> 
207,280d199
< class NewVocabParallelEmbedding(torch.nn.Module):
<     """Embedding parallelized in the vocabulary dimension.
< 
<     This is mainly adapted from torch.nn.Embedding and all the default
<     values are kept.
<     Arguments:
<         num_embeddings: vocabulary size.
<         embedding_dim: size of hidden state.
<         init_method: method to initialize weights.
<     """
< 
<     def __init__(self, num_embeddings, embedding_dim, tp_group_ranks,
<                  init_method=init.xavier_normal_):
<         super(NewVocabParallelEmbedding, self).__init__()
<         # Keep the input dimensions.
<         self.num_embeddings = num_embeddings
<         self.embedding_dim = embedding_dim
<         # Set the detauls for compatibility.
<         self.padding_idx = None
<         self.max_norm = None
<         self.norm_type = 2.
<         self.scale_grad_by_freq = False
<         self.sparse = False
<         self._weight = None
<         self.tensor_model_parallel_size = len(tp_group_ranks) # get_tensor_model_parallel_world_size()
<         # Divide the weight matrix along the vocaburaly dimension.
<         self.tp_group = get_group(tp_group_ranks)
<         self.tp_group_ranks = tp_group_ranks
<         self.vocab_start_index, self.vocab_end_index = \
<             VocabUtility.vocab_range_from_global_vocab_size(
<                 self.num_embeddings, torch.distributed.get_rank(group=self.tp_group),
<                 self.tensor_model_parallel_size)
<         self.num_embeddings_per_partition = self.vocab_end_index - \
<             self.vocab_start_index
< 
<         # Allocate weights and initialize.
<         args = get_args()
<         if args.use_cpu_initialization:
<             self.weight = Parameter(torch.empty(
<                 self.num_embeddings_per_partition, self.embedding_dim,
<                 dtype=args.params_dtype))
<             _initialize_affine_weight_cpu(
<                 self.weight, self.num_embeddings, self.embedding_dim,
<                 self.num_embeddings_per_partition, 0, init_method)
<         else:
<             self.weight = Parameter(torch.empty(
<                 self.num_embeddings_per_partition, self.embedding_dim,
<                 device=torch.cuda.current_device(), dtype=args.params_dtype))
<             _initialize_affine_weight_gpu(self.weight, init_method,
<                                           partition_dim=0, stride=1)
< 
<     def forward(self, input_):
<         if self.tensor_model_parallel_size > 1:
<             # Build the mask.
<             input_mask = (input_ < self.vocab_start_index) | \
<                          (input_ >= self.vocab_end_index)
<             # Mask the input.
<             masked_input = input_.clone() - self.vocab_start_index
<             masked_input[input_mask] = 0
<         else:
<             masked_input = input_
<             # Get the embeddings.
<         output_parallel = F.embedding(masked_input, self.weight,
<                                       self.padding_idx, self.max_norm,
<                                       self.norm_type, self.scale_grad_by_freq,
<                                       self.sparse)
<         # Mask the output embedding.
<         if self.tensor_model_parallel_size > 1:
<             output_parallel[input_mask, :] = 0.0
<         # Reduce across all the model parallel GPUs.
<         # output = reduce_from_tensor_model_parallel_region(output_parallel)
<         output = prim_all_reduce(output_parallel, self.tp_group_ranks)
<         return output
< 
356a276,277
> 
> 
407c328
<                  skip_bias_add=False, name=""):
---
>                  skip_bias_add=False):
409c330
<         self.name = name
---
> 
451a373,374
> 
> 
462,593d384
<         if not self.skip_bias_add:
<             output = output_ + self.bias if self.bias is not None else output_
<             output_bias = None
<         else:
<             output = output_
<             output_bias = self.bias
<         return output, output_bias
< 
< """
< The NewColumnParallelLinear and NewRowParallelLinear layers are designed for 
< supporting resharding between layers. These two layers do not include communications, 
< we put the needed communication outside the layers.
< """
< 
< class NewColumnParallelLinear(torch.nn.Module):
< 
<     def __init__(self, input_size, output_size, tp_size, bias=True, gather_output=True,
<                  init_method=init.xavier_normal_, stride=1,
<                  keep_master_weight_for_test=False,
<                  skip_bias_add=False):
<         super(NewColumnParallelLinear, self).__init__()
< 
<         # Keep input parameters
<         self.input_size = input_size
<         self.output_size = output_size
<         self.gather_output = gather_output
<         # Divide the weight matrix along the last dimension.
<         # world_size = get_tensor_model_parallel_world_size()
<         self.output_size_per_partition = divide(output_size, tp_size)
<         self.skip_bias_add = skip_bias_add
< 
<         # Parameters.
<         # Note: torch.nn.functional.linear performs XA^T + b and as a result
<         # we allocate the transpose.
<         # Initialize weight.
<         args = get_args()
<         if args.use_cpu_initialization:
<             self.weight = Parameter(torch.empty(self.output_size_per_partition,
<                                                 self.input_size,
<                                                 dtype=args.params_dtype))
<             self.master_weight = _initialize_affine_weight_cpu(
<                 self.weight, self.output_size, self.input_size,
<                 self.output_size_per_partition, 0, init_method,
<                 stride=stride, return_master_weight=keep_master_weight_for_test)
<         else:
<             self.weight = Parameter(torch.empty(
<                 self.output_size_per_partition, self.input_size,
<                 device=torch.cuda.current_device(), dtype=args.params_dtype))
<             _initialize_affine_weight_gpu(self.weight, init_method,
<                                           partition_dim=0, stride=stride)
<             
<         if bias:
<             if args.use_cpu_initialization:
<                 self.bias = Parameter(torch.empty(
<                     self.output_size_per_partition, dtype=args.params_dtype))
<             else:
<                 self.bias = Parameter(torch.empty(
<                     self.output_size_per_partition,
<                     device=torch.cuda.current_device(),
<                     dtype=args.params_dtype))
<             set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
<             # Always initialize bias to zero.
<             with torch.no_grad():
<                 self.bias.zero_()
<         else:
<             self.register_parameter('bias', None)
< 
<     def forward(self, input_):
<         # Matrix multiply.
<         bias = self.bias if not self.skip_bias_add else None
<         output_parallel = F.linear(input_, self.weight, bias)
<         if self.gather_output:
<             # All-gather across the partitions.
<             output = gather_from_tensor_model_parallel_region(output_parallel)
<         else:
<             output = output_parallel 
<         output_bias = self.bias if self.skip_bias_add else None
<         return output, output_bias
< 
< 
< class NewRowParallelLinear(torch.nn.Module):
< 
<     def __init__(self, input_size, output_size, tp_size, bias=True,
<                  init_method=init.xavier_normal_, stride=1,
<                  keep_master_weight_for_test=False,
<                  skip_bias_add=False, name=""):
<         super(NewRowParallelLinear, self).__init__()
<         self.name = name
<         # Keep input parameters
<         self.input_size = input_size
<         self.output_size = output_size
<         # Divide the weight matrix along the last dimension.
<         # world_size = get_tensor_model_parallel_world_size()
<         self.input_size_per_partition = divide(input_size, tp_size)
<         self.skip_bias_add = skip_bias_add
< 
<         # Parameters.
<         # Note: torch.nn.functional.linear performs XA^T + b and as a result
<         # we allocate the transpose.
<         # Initialize weight.
<         args = get_args()
<         if args.use_cpu_initialization:
<             self.weight = Parameter(torch.empty(self.output_size,
<                                                 self.input_size_per_partition,
<                                                 dtype=args.params_dtype))
<             self.master_weight = _initialize_affine_weight_cpu(
<                 self.weight, self.output_size, self.input_size,
<                 self.input_size_per_partition, 1, init_method,
<                 stride=stride, return_master_weight=keep_master_weight_for_test)
<         else:
<             self.weight = Parameter(torch.empty(
<                 self.output_size, self.input_size_per_partition,
<                 device=torch.cuda.current_device(), dtype=args.params_dtype))
<             _initialize_affine_weight_gpu(self.weight, init_method,
<                                           partition_dim=1, stride=stride)
<         if bias:
<             if args.use_cpu_initialization:
<                 self.bias = Parameter(torch.empty(self.output_size,
<                                                   dtype=args.params_dtype))
<             else:
<                 self.bias = Parameter(torch.empty(
<                     self.output_size, device=torch.cuda.current_device(),
<                     dtype=args.params_dtype))
<             # Always initialize bias to zero.
<             with torch.no_grad():
<                 self.bias.zero_()
<         else:
<             self.register_parameter('bias', None)
< 
<     def forward(self, input_):
<         # Matrix multiply.
<         output_ = F.linear(input_, self.weight)
diff --color -r runtime/megatron/mpu/mappings.py ../Megatron-LM-base/megatron/mpu/mappings.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/mpu/mappings.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
26,28c18,20
< from .initialize import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank, get_group, set_resharding_group, get_resharding_group, set_resharding_dim, get_resharding_dim, set_resharding_rank, get_resharding_rank, get_op_resharding_ranks, set_op_resharding_ranks, get_ranks_via_pipeline_stage, get_pipeline_model_parallel_rank
< from .utils import split_tensor_along_last_dim, divide
< import numpy as np
---
> from .initialize import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank
> from .utils import split_tensor_along_last_dim
> 
147,468d138
< #### Aceso:
< 
< def new_split(input_, ranks, dim):
< 
<     if dim == -1:
<         dim = input_.dim() - 1
< 
<     dim_size = divide(input_.size()[dim], len(ranks))
<     tensor_list = torch.split(input_, dim_size, dim=dim)
<     tensor_list = tuple(chunk.contiguous() for chunk in tensor_list)   
<     return tensor_list[torch.distributed.get_rank(get_group(ranks))].contiguous()
< 
< def new_all_gather(input_, ranks, dim):
<     if dim == -1:
<         dim = input_.dim() - 1
<     
<     if not input_.is_contiguous():
<         input_ = input_.contiguous()
<     tensor_list = [torch.empty_like(input_) for _ in ranks]
< 
<     torch.distributed.all_gather(tensor_list, input_, group=get_group(ranks))
<     torch.cuda.synchronize()
< 
<     # concat
<     new_input_ = torch.cat(tensor_list, dim=dim).contiguous().requires_grad_()
< 
<     return new_input_    
< 
< def new_reduce(input_, ranks):
<     """All-reduce the the input tensor across model parallel group."""
< 
<     # Bypass the function if we are using only 1 GPU.
<     if len(ranks)==1:
<         return input_
< 
<     # All-reduce.
<     torch.distributed.all_reduce(input_, group=get_group(ranks))
<     torch.cuda.synchronize()
< 
<     return input_
< 
< def new_reduce_scatter(input_, ranks, dim):
< 
<     input_list = list(input_.chunk(len(ranks), dim))
<     for idx, tensor in enumerate(input_list):
<         if not tensor.is_contiguous():
<             input_list[idx] = tensor.contiguous()
<     new_input_ = torch.empty_like(input_list[0], requires_grad=True)
<     torch.distributed.reduce_scatter(new_input_, input_list, group=get_group(ranks))
<     torch.cuda.synchronize()
<     return new_input_
< 
< def new_all_to_all(input_, ranks, src_dim, dst_dim):
< 
<     input_list = list(input_.chunk(len(ranks), dim=dst_dim))
<     for idx, tensor in enumerate(input_list):
<         if not tensor.is_contiguous():
<             input_list[idx] = tensor.contiguous()
<     new_input_list = [torch.empty_like(t) for t in input_list]
<     torch.distributed.all_to_all(new_input_list, input_list, group=get_group(ranks))
<     torch.cuda.synchronize()
<     new_input_ = torch.concat(tuple(new_input_list), dim=src_dim).requires_grad_()
< 
<     return new_input_    
< 
< class _PrimSplit(torch.autograd.Function):
<     
<     @staticmethod
<     def forward(ctx, input_):
<         # print(f"[DEBUG] fwd: split")
<         ctx.ranks = get_resharding_group() 
<         ctx.dim = get_resharding_dim()
<         return new_split(input_, ctx.ranks, ctx.dim)
< 
<     @staticmethod
<     def backward(ctx, grad_output):
<         # print(f"[DEBUG] bwd: all-gather")
<         ranks = ctx.ranks
<         dim = ctx.dim
<         return new_all_gather(grad_output, ranks, dim)
< 
< class _PrimAllGather(torch.autograd.Function):
<     
<     @staticmethod
<     def forward(ctx, input_):
<         # print(f"[DEBUG] fwd: all-gather")
<         ctx.ranks = get_resharding_group() 
<         ctx.dim = get_resharding_dim()
<         return new_all_gather(input_, ctx.ranks, ctx.dim)
< 
<     @staticmethod
<     def backward(ctx, grad_output):
<         # print(f"[DEBUG] bwd: split")
<         ranks = ctx.ranks
<         dim = ctx.dim
<         return new_split(grad_output, ranks, dim)
< 
< class _PrimAllReduce(torch.autograd.Function):
< 
<     @staticmethod
<     def forward(ctx, input_):
<         # print(f"[DEBUG] fwd: all-reduce")
<         ranks = get_resharding_group()
<         return new_reduce(input_, ranks)
< 
<     @staticmethod
<     def backward(ctx, grad_output):
<         # print(f"[DEBUG] bwd: None")
<         return grad_output
< 
< class _PrimReduceScatter(torch.autograd.Function):
< 
<     @staticmethod
<     def forward(ctx, input_):
<         # print(f"[DEBUG] fwd: reduce-scatter")
<         ctx.ranks = get_resharding_group()
<         ctx.dim = get_resharding_dim()
<         return new_reduce_scatter(input_, ctx.ranks, ctx.dim)
< 
<     @staticmethod
<     def backward(ctx, grad_output):
<         # print(f"[DEBUG] bwd: all-gather")
<         ranks = ctx.ranks
<         dim = ctx.dim        
<         return new_all_gather(grad_output, ctx.ranks, ctx.dim)
< 
< class _PrimReplicate(torch.autograd.Function):
<     """Pass the input to the model parallel region."""
< 
<     @staticmethod
<     def forward(ctx, input_):
<         # print(f"[DEBUG] fwd: replicate")
<         ctx.ranks = get_resharding_group()
<         return input_
< 
<     @staticmethod
<     def backward(ctx, grad_output):
<         # print(f"[DEBUG] bwd: all-reduce")
<         ranks = ctx.ranks
<         return new_reduce(grad_output, ranks)
< 
< class _PrimAlltoAll(torch.autograd.Function):
<     """Pass the input to the model parallel region."""
< 
<     @staticmethod
<     def forward(ctx, input_):
<         ctx.ranks = get_resharding_group()
<         ctx.src_dim = get_resharding_dim()[0]
<         ctx.dst_dim = get_resharding_dim()[1]
<         return new_all_to_all(input_, ctx.ranks, ctx.src_dim, ctx.dst_dim)
< 
<     @staticmethod
<     def backward(ctx, grad_output):
<         ranks = ctx.ranks
<         src_dim = ctx.src_dim
<         dst_dim = ctx.dst_dim
<         return new_all_to_all(grad_output, ranks, dst_dim, src_dim)
< 
< def prim_split(input_, ranks, dim):
<     set_resharding_group(ranks)
<     set_resharding_dim(dim)
<     return _PrimSplit.apply(input_)
< 
< def prim_all_reduce(input_, ranks):
<     set_resharding_group(ranks)
<     return _PrimAllReduce.apply(input_)
< 
< def prim_reduce_scatter(input_, ranks, dim):
<     set_resharding_group(ranks)
<     set_resharding_dim(dim)
<     return _PrimReduceScatter.apply(input_)
< 
< def prim_all_to_all(input_, ranks, src_dim, dst_dim):
<     set_resharding_group(ranks)
<     set_resharding_dim([src_dim, dst_dim])
<     return _PrimAlltoAll.apply(input_)
< 
< def prim_all_gather(input_, ranks, dim):
<     set_resharding_group(ranks)
<     set_resharding_dim(dim)
<     return _PrimAllGather.apply(input_)
< 
< def transpose(mat: np.ndarray, dim0: int, dim1: int, get_reverse=False):
<     """
<     (from Zhiqi's codebase)
<     put the dim0 and dim1 of the mat to the last two dims
<     """
<     ndims = len(mat.shape)
<     axes = list(range(ndims))
<     assert dim0 < ndims and dim1 < ndims, "dim0 or dim1 out of index"
<     axes.pop(max(dim0, dim1))
<     axes.pop(min(dim0, dim1))
<     axes += [dim0, dim1]
< 
<     if get_reverse:
<         reverse_axes = []
<         for original_index in range(ndims):
<             for new_index in axes:
<                 if axes[new_index] == original_index:
<                     reverse_axes.append(new_index)
<         return np.transpose(mat, axes), reverse_axes
<     else:
<         return np.transpose(mat, axes)
< 
< def identical_spec(input_spec, required_spec):
<     identical = True 
<     ## this is used in T5, to pass encoder_output.
<     if len(input_spec) == 0 and len(required_spec) == 0:
<         return identical
< 
<     if input_spec["R"] != required_spec["R"]:
<         identical = False
<     if input_spec["V"] != required_spec["V"]:
<         identical = False    
<     for dim_index in range(len(input_spec["dims"])):
<         if input_spec["dims"][dim_index] != required_spec["dims"][dim_index]:
<             identical = False
<     
<     return identical
< 
< def tensor_adapter_handler(input_dev_mat, init_output_dev_mat, inc_dim, dec_dim, inc_to_size, dec_to_size):
<     trans_in_dev_mat = transpose(input_dev_mat, inc_dim, dec_dim)
<     trans_out_dev_mat, reverse_axes = transpose(init_output_dev_mat, inc_dim, dec_dim, get_reverse=True)
< 
<     for index_r in range(len(trans_in_dev_mat)): 
<         for index_v in range(len(trans_in_dev_mat[index_r])): 
<             for index_d in range(len(trans_in_dev_mat[index_r][index_v])):
<                 tmp_arrays = np.hsplit(trans_in_dev_mat[index_r][index_v][index_d], dec_to_size)
<                 tmp_arrays = [tmp_arrays[i].reshape(inc_to_size, 1) for i in range(len(tmp_arrays))]
<                 new_mat = np.hstack(tmp_arrays)
<                 trans_out_dev_mat[index_r][index_v][index_d] = new_mat
<     output_dev_mat = trans_out_dev_mat.transpose(reverse_axes)   
< 
<     return trans_in_dev_mat, output_dev_mat
< 
< def tensor_adapter(input_, input_spec, required_spec, input_dev_mat):
<     if identical_spec(input_spec, required_spec) or len(required_spec) == 0:
<         return input_, input_dev_mat
< 
<     rank = torch.distributed.get_rank()
<     # init_output_dev_mat = np.array([0 for _ in range(torch.distributed.get_world_size())]).reshape([required_spec["R"], required_spec["V"]] + required_spec["dims"])
<     all_ranks = get_ranks_via_pipeline_stage(get_pipeline_model_parallel_rank())
<     init_output_dev_mat = np.array(all_ranks).reshape([required_spec["R"], required_spec["V"]] + required_spec["dims"])
< 
<     if input_spec["R"] > required_spec["R"]:
<         ## R -> Dim, split
<         for dim_index in range(len(input_spec["dims"])):
<             if input_spec["dims"][dim_index] < required_spec["dims"][dim_index]:
<                 assert input_spec["R"] % required_spec["R"] == 0
< 
<                 trans_in_dev_mat, output_dev_mat = tensor_adapter_handler(
<                     input_dev_mat, init_output_dev_mat, inc_dim=2+dim_index, dec_dim=0, 
<                     inc_to_size=required_spec["dims"][dim_index], dec_to_size=required_spec["R"]
<                 )
<                 num_chunks = input_spec["R"] // required_spec["R"]
< 
<                 for devices in trans_in_dev_mat.reshape(-1, num_chunks):
<                     if rank in devices:
<                         return prim_split(input_, devices, dim_index), output_dev_mat
<                 
<     elif input_spec["V"] > required_spec["V"]:
<         ## V -> R, all-reduce
<         if input_spec["R"] < required_spec["R"]:
<             assert input_spec["V"] % required_spec["V"] == 0
< 
<             trans_in_dev_mat, output_dev_mat = tensor_adapter_handler(
<                 input_dev_mat, init_output_dev_mat, inc_dim=0, dec_dim=1, 
<                 inc_to_size=required_spec["R"], dec_to_size=required_spec["V"]
<             )
<             num_chunks = input_spec["V"] // required_spec["V"]
< 
<             for devices in trans_in_dev_mat.reshape(-1, num_chunks):
<                 if rank in devices:
<                     return prim_all_reduce(input_, devices), output_dev_mat    
<             
<         ## V-> D, reduce-scatter
<         for dim_index in range(len(input_spec["dims"])):
<             if input_spec["dims"][dim_index] < required_spec["dims"][dim_index]:
<                 assert input_spec["V"] % required_spec["V"] == 0
< 
<                 trans_in_dev_mat, output_dev_mat = tensor_adapter_handler(
<                     input_dev_mat, init_output_dev_mat, inc_dim=2+dim_index, dec_dim=1, 
<                     inc_to_size=required_spec["dims"][dim_index], dec_to_size=required_spec["V"]
<                 )
<                 num_chunks = input_spec["V"] // required_spec["V"]
< 
<                 for devices in trans_in_dev_mat.reshape(-1, num_chunks):
<                     if rank in devices:
<                         return prim_reduce_scatter(input_, devices, dim_index), output_dev_mat 
< 
<     else:
<         for src_dim_index in range(len(input_spec["dims"])):
<             if input_spec["dims"][src_dim_index] > required_spec["dims"][src_dim_index]:
<                 ## D -> R, all-gather
<                 if input_spec["R"] < required_spec["R"]:
<                     assert input_spec["dims"][src_dim_index] % required_spec["dims"][src_dim_index] == 0
<                     trans_in_dev_mat, output_dev_mat = tensor_adapter_handler(
<                         input_dev_mat, init_output_dev_mat, inc_dim=0, dec_dim=2+src_dim_index, 
<                         inc_to_size=required_spec["R"], dec_to_size=required_spec["dims"][src_dim_index]
<                     )
<                     num_chunks = input_spec["dims"][src_dim_index] // required_spec["dims"][src_dim_index]
< 
<                     for devices in trans_in_dev_mat.reshape(-1, num_chunks):
<                         if rank in devices:                            
<                             return prim_all_gather(input_, devices, src_dim_index), output_dev_mat  
< 
<                 for dst_dim_index in range(len(input_spec["dims"])):
<                     ## D -> D, all-to-all
<                     if dst_dim_index != src_dim_index and input_spec["dims"][dst_dim_index] < required_spec["dims"][dst_dim_index]:
<                         assert input_spec["dims"][src_dim_index] % required_spec["dims"][src_dim_index] == 0
< 
<                         trans_in_dev_mat, output_dev_mat = tensor_adapter_handler(
<                             input_dev_mat, init_output_dev_mat, inc_dim=2+dst_dim_index, dec_dim=2+src_dim_index, 
<                             inc_to_size=required_spec["dims"][dst_dim_index], dec_to_size=required_spec["dims"][src_dim_index]
<                         )
<                         num_chunks = input_spec["dims"][src_dim_index] // required_spec["dims"][src_dim_index]
< 
<                         for devices in trans_in_dev_mat.reshape(-1, num_chunks):
<                             if rank in devices:
<                                 return prim_all_to_all(input_, devices, src_dim_index, dst_dim_index), output_dev_mat    
< 
<         raise RuntimeError(f"No communication pattern found. input_spec: {input_spec}\nrequired_spec: {required_spec}")
477,480d146
< def copy_to_tensor_model_parallel_region_test(input_):
<     # return _CopyToModelParallelRegion.apply(input_)
<     return input_
< 
492,527d157
< 
< def new_copy_to_tensor_model_parallel_region(op_index, input_, input_spec, input_dev_mat):
<     num_replicates = input_spec["R"]
<     if num_replicates == 1:
<         return input_ 
<     else:
<         op_resharding_ranks = get_op_resharding_ranks(op_index)
<         if op_resharding_ranks is None:
<             rank = torch.distributed.get_rank()
<             trans_in_dev_mat = transpose(input_dev_mat, 1, 0)
<             for ranks in trans_in_dev_mat.reshape(-1, num_replicates):
<                 if rank in ranks:
<                     op_resharding_ranks = ranks 
<                     set_op_resharding_ranks(op_index, ranks)
<         set_resharding_group(op_resharding_ranks)
<         return _PrimReplicate.apply(input_)
<     
<     raise RuntimeError("failed in new_copy_to_tensor_model_parallel_region")
< 
< def new_reduce_from_tensor_model_parallel_region(op_index, input_, input_spec, input_dev_mat):
<     num_replicates = input_spec["R"]
<     if num_replicates == 1:
<         return input_ 
<     else:
<         op_resharding_ranks = get_op_resharding_ranks(op_index)
<         if op_resharding_ranks is None:
<             rank = torch.distributed.get_rank()
<             trans_in_dev_mat = transpose(input_dev_mat, 1, 0)
<             for ranks in trans_in_dev_mat.reshape(-1, num_replicates):
<                 if rank in ranks:
<                     op_resharding_ranks = ranks 
<                     set_op_resharding_ranks(op_index, ranks)                    
<         set_resharding_group(op_resharding_ranks)
<         return _PrimAllReduce.apply(input_)
<     
<     raise RuntimeError("failed in new_reduce_from_tensor_model_parallel_region")    
\ No newline at end of file
diff --color -r runtime/megatron/mpu/random.py ../Megatron-LM-base/megatron/mpu/random.py
27c27
< from megatron import get_args, mpu
---
> from megatron import get_args
266,271d265
<         # if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
<         #     ctx.input_0_shape = args[0].data.shape
<         #     args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
<         #     args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
<         #         args[0].data)
< 
273,276c267,270
<             ctx.inputs_shape = [args[i].data.shape for i in range(len(args))]
<             for i in range(len(args)):
<                 args[i].data = split_tensor_into_1d_equal_chunks(args[i].data)
<                 args[i].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(args[i].data)
---
>             ctx.input_0_shape = args[0].data.shape
>             args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
>             args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
>                 args[0].data)
290,293d283
<         # if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
<         #     inputs[0].data = gather_split_1d_tensor(inputs[0].data)
<         #     inputs[0].data = inputs[0].data.view(ctx.input_0_shape)
< 
295,297c285,286
<             for i in range(len(inputs)):
<                 inputs[i].data = gather_split_1d_tensor(inputs[i].data)
<                 inputs[i].data = inputs[i].data.view(ctx.inputs_shape)
---
>             inputs[0].data = gather_split_1d_tensor(inputs[0].data)
>             inputs[0].data = inputs[0].data.view(ctx.input_0_shape)
321d309
< 
324,325c312
<                       for inp in detached_inputs)   
< 
---
>                       for inp in detached_inputs)
diff --color -r runtime/megatron/optimizer/__init__.py ../Megatron-LM-base/megatron/optimizer/__init__.py
103d102
<                                                  args.use_contiguous_buffers_in_ddp,
110c109
<                          params_have_main_grad, args.use_contiguous_buffers_in_ddp)
---
>                          params_have_main_grad)
diff --color -r runtime/megatron/optimizer/optimizer.py ../Megatron-LM-base/megatron/optimizer/optimizer.py
25a26
> from megatron import get_timers
31,32d31
< import os
< SKIP_WHEN_INF = os.environ.get("SKIP_WHEN_INF", '0') == '1'
191c190
<                  params_have_main_grad, use_contiguous_buffers_in_ddp, bf16, grad_scaler):
---
>                  params_have_main_grad, bf16, grad_scaler):
196,197d194
<         ## new Megatron
<         self.use_contiguous_buffers_in_ddp = use_contiguous_buffers_in_ddp
286,294d282
<     # def zero_grad(self, set_to_none=True):
<     #     """We only need to zero the model related parameters, i.e.,
<     #             float16_groups & fp32_from_fp32_groups."""
<     #     for group in self.float16_groups:
<     #         _zero_grad_group_helper(group, set_to_none)
<     #     for group in self.fp32_from_fp32_groups:
<     #         _zero_grad_group_helper(group, set_to_none)
< 
<     ## New Megatron
297,300c285
<         float16_groups & fp32_from_fp32_groups. We additionally zero
<         fp32_from_float16_groups as a memory optimization to reduce
<         fragmentation; in the case of set_to_none==True, the space
<         used by this field can be safely deallocated at this point."""
---
>                 float16_groups & fp32_from_fp32_groups."""
303,304d287
<         for group in self.fp32_from_float16_groups:
<             _zero_grad_group_helper(group, set_to_none)
307a291
> 
319,321c303,305
<                 if self.params_have_main_grad: # and hasattr(model_param, 'main_grad'): ?
<                     main_param.grad = model_param.main_grad.float()         
<                 else:                      
---
>                 if self.params_have_main_grad:
>                     main_param.grad = model_param.main_grad.float()
>                 else:
324,333d307
< 
<                 ## from new Megatron
<                 # Safe to deallocate model's grad/main_grad after copying.
<                 # (If using contiguous buffers, main_grad's memory should
<                 # persist and therefore should not be deallocated.)
<                 model_param.grad = None
<                 if self.params_have_main_grad and \
<                    not self.use_contiguous_buffers_in_ddp:
<                     model_param.main_grad = None
< 
340,346d313
<                     ## from new Megatron
<                     # Safe to de-reference model's main_grad after copying.
<                     # (If using contiguous buffers, main_grad's memory should
<                     # persist and therefore should not be deallocated.)
<                     if not self.use_contiguous_buffers_in_ddp:
<                         model_param.main_grad = None
< 
406a374,375
>         timers = get_timers()
> 
407a377
>         timers('optimizer-copy-to-main-grad').start()
408a379
>         timers('optimizer-copy-to-main-grad').stop()
414a386
>             timers('optimizer-unscale-and-check-inf').start()
415a388
>             timers('optimizer-unscale-and-check-inf').stop()
422c395
<             if found_inf_flag and SKIP_WHEN_INF:
---
>             if found_inf_flag:
426c399
< 
---
>         timers('optimizer-clip-main-grad').start()
429a403
>         timers('optimizer-clip-main-grad').stop()
438a413
>         timers('optimizer-copy-main-to-model-params').start()
439a415
>         timers('optimizer-copy-main-to-model-params').stop()
491c467
<                  params_have_main_grad, use_contiguous_buffers_in_ddp):
---
>                  params_have_main_grad):
498d473
<         self.use_contiguous_buffers_in_ddp = use_contiguous_buffers_in_ddp
522,529d496
< 
<                     ## from new Megatron.
<                     # Safe to de-reference model's main_grad after copying.
<                     # (If using contiguous buffers, main_grad's memory should
<                     # persist and therefore should not be deallocated.)
<                     if not self.use_contiguous_buffers_in_ddp:
<                         param.main_grad = None
< 
diff --color -r runtime/megatron/p2p_communication.py ../Megatron-LM-base/megatron/p2p_communication.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/p2p_communication.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
28c20
< from megatron import get_args, get_timers
---
> from megatron import get_args
30,46d21
< from megatron.utils import debug_mem_report
< from megatron.utils import report_memory
< 
< import os
< import time
< 
< DEBUG_COMMUNICATE = os.environ.get("DEBUG_COMMUNICATE", '0') == '1'
< EXTRA_TENSOR_TRANSFER = os.environ.get("EXTRA_TENSOR_TRANSFER", '1') == '1'
< 
< def print_tensor_dict_info(name, tensor_dict):
<     args = get_args()
<     string = f"rank {torch.distributed.get_rank()} {name} dict: \n"
<     for key in sorted(tensor_dict):
<         if tensor_dict[key] is not None:
<             string += f"{key}: {list(tensor_dict[key].size())} size = {reduce(operator.mul, list(tensor_dict[key].size()), 1)}\n"
<         else:
<             string += f"{key}: {None}\n"
48,49d22
<     with open(f"{args.log_path}{args.log_name}_debug_communicate_rank{torch.distributed.get_rank()}.log", "a+") as f:
<         f.write(string+"\n") 
51,55c24,39
< def print_communication_info(current_rank, op, other_rank, tensor_size):
<     args = get_args()
<     string = f"rank {current_rank} | {op} {other_rank}. size = {tensor_size}."
<     with open(f"{args.log_path}{args.log_name}_debug_communicate_rank{current_rank}.log", "a+") as f:
<         f.write(string+"\n")    
---
> def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
>                  use_ring_exchange=False):
>     """Communicate tensors between stages. Used as helper method in other
>     communication methods that are used in megatron/schedules.py.
> 
>     Takes the following arguments:
>         tensor_send_next: tensor to send to next rank (no tensor sent if
>                           set to None).
>         tensor_send_prev: tensor to send to prev rank (no tensor sent if
>                           set to None).
>         recv_prev: boolean for whether tensor should be received from
>                    previous rank.
>         recv_next: boolean for whether tensor should be received from
>                    next rank.
>         use_ring_exchange: boolean for whether torch.distributed.ring_exchange()
>                            API should be used.
57,84c41,42
< def _create_recv_placeholder(forward=True):
<     args = get_args()
<     dtype = args.params_dtype
<     if args.fp32_residual_connection:
<         dtype = torch.float   
< 
<     recv_info = mpu.get_recv_info(forward)
<     flatten_tensor_recv_prev = {}
<     for key in sorted(recv_info["tensors"]):
<         flatten_tensor_recv_prev[key] = []
<         num_chunks = recv_info["tensors"][key]["num_tp_chunks"] * recv_info["tensors"][key]["num_dp_chunks"]
<         recv_shape = list(recv_info["tensors"][key]["shape"])
<         if recv_info["tensors"][key]["tp_split_dim"] == -1 and args.scatter_gather_tensors_in_pipeline:
<             rank = mpu.get_pipeline_model_parallel_rank()
<             if forward:
<                 op_index = mpu.get_op_start_index(rank)
<             else:
<                 op_index = mpu.get_op_end_index(rank) - 1
< 
<             assert recv_shape[0] % mpu.get_op_tp_size(op_index) == 0
<             recv_shape[0] //= mpu.get_op_tp_size(op_index)
<             recv_shape[0] //= recv_info["tensors"][key]["num_tp_chunks"]
<         for _ in range(num_chunks):
<             flatten_tensor_recv_prev[key].append(torch.empty(recv_shape, requires_grad=True, device=torch.cuda.current_device(), dtype=dtype))
< 
<     return flatten_tensor_recv_prev
< 
< def _partition(tensor, info, forward):
---
>     Returns:
>         (tensor_recv_prev, tensor_recv_next)
86,95d43
<     This function first partition each tensor and extra tensor according to number of receivers.
<     Then flatten all the tensors and concat them into one large tensor.
<     """
<     
<     tp_split_dim = info["tp_split_dim"]
<     dp_split_dim = info["dp_split_dim"]
<     num_tp_chunks = info["num_tp_chunks"]
<     num_dp_chunks = info["num_dp_chunks"]
<     tp_chunks_index = info["tp_chunks_index"]
<     dp_chunks_index = info["dp_chunks_index"]
98,102c46,53
<     if dp_split_dim != -1:
<         _tmp_list = list(torch.chunk(tensor, chunks=num_dp_chunks, dim=dp_split_dim)) 
<         tensor_split = []
<         for i in range(len(dp_chunks_index)):
<             tensor_split.append(_tmp_list[dp_chunks_index[i]].contiguous())
---
>     # Create placeholder tensors for receive in forward and backward directions
>     # if needed.
>     tensor_recv_prev = None
>     tensor_recv_next = None
>     tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
>     if args.scatter_gather_tensors_in_pipeline:
>         tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1) // \
>             mpu.get_tensor_model_parallel_world_size()
104,111c55,84
<         tensor_split = [tensor for _ in range(num_dp_chunks)]
<     
<     if tp_split_dim != -1:
<         for i in range(len(tensor_split)):
<             _tmp_list = list(torch.chunk(tensor_split[i], chunks=num_tp_chunks, dim=tp_split_dim)) 
<             tensor_split[i] = []
<             for j in range(len(tp_chunks_index)):
<                 tensor_split[i].append(_tmp_list[tp_chunks_index[j]].contiguous())                
---
>         tensor_chunk_shape = tensor_shape
>     dtype = args.params_dtype
>     if args.fp32_residual_connection:
>         dtype = torch.float
>     if recv_prev:
>         tensor_recv_prev = torch.empty(tensor_chunk_shape,
>                                        requires_grad=True,
>                                        device=torch.cuda.current_device(),
>                                        dtype=dtype)
>     if recv_next:
>         tensor_recv_next = torch.empty(tensor_chunk_shape,
>                                        requires_grad=True,
>                                        device=torch.cuda.current_device(),
>                                        dtype=dtype)
> 
>     # Split tensor into smaller chunks if using scatter-gather optimization.
>     if args.scatter_gather_tensors_in_pipeline:
>         if tensor_send_next is not None:
>             tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(tensor_send_next)
> 
>         if tensor_send_prev is not None:
>             tensor_send_prev = mpu.split_tensor_into_1d_equal_chunks(tensor_send_prev)
> 
>     # Send tensors in both the forward and backward directions as appropriate.
>     if use_ring_exchange:
>         torch.distributed.ring_exchange(tensor_send_prev=tensor_send_prev,
>                                         tensor_recv_prev=tensor_recv_prev,
>                                         tensor_send_next=tensor_send_next,
>                                         tensor_recv_next=tensor_recv_next,
>                                         group=mpu.get_pipeline_model_parallel_group())
113,246c86,107
<         for i in range(len(tensor_split)):
<             if args.scatter_gather_tensors_in_pipeline:
<                 rank = mpu.get_pipeline_model_parallel_rank()
<                 if forward:
<                     op_index = mpu.get_op_end_index(rank) - 1
<                 else:
<                     op_index = mpu.get_op_start_index(rank)
< 
<                 assert tensor_split[i].size()[0] >= num_tp_chunks * mpu.get_op_tp_size(op_index), "scatter_gather_tensors_in_pipeline is only available when mciro batch size >= num_splits"
<                 _tmp_list = list(torch.chunk(tensor_split[i], chunks=num_tp_chunks * mpu.get_op_tp_size(op_index), dim=0)) 
<                 tp_rank = torch.distributed.get_rank(group=mpu.get_tensor_model_parallel_group(op_index))
<                 new_tensor_split = [_tmp_list[num_tp_chunks * tp_rank + j].contiguous() for j in range(num_tp_chunks)]
<             else:
<                 new_tensor_split = [tensor_split[i] for _ in range(num_tp_chunks)]
<             tensor_split[i] = new_tensor_split
< 
<     _tensor_split = [n for a in tensor_split for n in a]
< 
<     return _tensor_split
< 
< def _reshape(recv_tensor, recv_info, forward):
<     args = get_args()
<     tensor_dict = {}
<     extra_tensor_dict = {}
< 
<     for key in sorted(recv_info["tensors"]):
<         num_tp_chunks = recv_info["tensors"][key]["num_tp_chunks"]
<         num_dp_chunks = recv_info["tensors"][key]["num_dp_chunks"]
<         tp_split_dim = recv_info["tensors"][key]["tp_split_dim"]
<         dp_split_dim = recv_info["tensors"][key]["dp_split_dim"]
<         tensor_list = recv_tensor[key]       
< 
<         if not EXTRA_TENSOR_TRANSFER and recv_info["tensors"][key]["extra_tensor"]:
<             data_size = tensor_list[0].size()
<             if args.model_name == "resnet":
<                 data_type = torch.float32
<             else:
<                 data_type = torch.float16
<             for i in range(len(tensor_list)):
<                 tensor_list[i] = torch.ones(data_size, requires_grad=True, device=torch.cuda.current_device(), dtype=data_type) 
< 
<         if num_tp_chunks > 1:
<             if tp_split_dim == -1 and args.scatter_gather_tensors_in_pipeline:
<                 _tensor_list = []
<                 for i in range(len(tensor_list)):
<                     _tensor_list.append(torch.cat(tensor_list[i: i+num_tp_chunks], dim=0))
<                     i += num_tp_chunks
<                 tensor_list = _tensor_list  
<             else:
<                 _tensor_list = []
<                 for i in range(len(tensor_list)):
<                     _tensor_list.append(torch.cat(tensor_list[i: i+num_tp_chunks], dim=tp_split_dim))
<                     i += num_tp_chunks
<                 tensor_list = _tensor_list  
< 
<         if num_dp_chunks > 1:
<             _tensor_list = []
<             for i in range(len(tensor_list)):
<                 _tensor_list.append(torch.cat(tensor_list[i: i+num_dp_chunks], dim=dp_split_dim))
<                 i += num_dp_chunks
<             tensor_list = _tensor_list  
< 
<         if tp_split_dim == -1 and args.scatter_gather_tensors_in_pipeline:
<             rank = mpu.get_pipeline_model_parallel_rank()
<             if forward:
<                 op_index = mpu.get_op_start_index(rank)  
<             else:
<                 op_index = mpu.get_op_end_index(rank) - 1
<             tp_size = mpu.get_op_tp_size(op_index)
< 
<             gather_list = [torch.empty_like(tensor_list[0]) for _ in range(tp_size)]
<             torch.distributed.all_gather(gather_list, tensor_list[0], group=mpu.get_tensor_model_parallel_group(op_index))
<             output = torch.cat(gather_list, dim=0).contiguous()
< 
<             if recv_info["tensors"][key]["extra_tensor"]:
<                 extra_tensor_dict[key] = output
<             else:
<                 tensor_dict[key] = output
<         else:
<             if recv_info["tensors"][key]["extra_tensor"]:
<                 extra_tensor_dict[key] = tensor_list[0]  
<             else:
<                 tensor_dict[key] = tensor_list[0]
< 
<     if DEBUG_COMMUNICATE:
<         print_tensor_dict_info("recieved tensors", tensor_dict)
<         print_tensor_dict_info("received extra tensors", extra_tensor_dict)
< 
<     return tensor_dict, extra_tensor_dict
< 
< def _communicate_flexpipe(tensor_send_next, tensor_send_prev, extra_tensor_send_next, extra_tensor_send_prev, recv_prev, recv_next):
< 
<     timers = get_timers()
< 
<     prev_ranks = mpu.get_stage_comm_recv_ranks()
<     next_ranks = mpu.get_stage_comm_send_ranks()
<     num_parents = len(prev_ranks)
<     num_childs = len(next_ranks)      
<     tensor_recv_prev, extra_tensor_recv_prev, tensor_recv_next, extra_tensor_recv_next = None, None, None, None 
< 
<     # Create placeholder tensors for receive in forward and backward directions if needed.
<     with torch.no_grad():
<         if recv_prev:
<             flatten_tensor_recv_prev = _create_recv_placeholder(forward=True)
<         if recv_next:
<             flatten_tensor_recv_next = _create_recv_placeholder(forward=False)
< 
<     if tensor_send_prev is not None:
<         send_info = mpu.get_send_info(forward=False)
<         for key in sorted(send_info["tensors"]):
<             ops = []
<             with torch.no_grad():
<                 if key in tensor_send_prev:
<                     tensor_partitioned = _partition(tensor_send_prev[key], send_info["tensors"][key], forward=False)
<                 elif key in extra_tensor_send_prev:
<                     if EXTRA_TENSOR_TRANSFER:
<                         tensor_partitioned = _partition(extra_tensor_send_prev[key], send_info["tensors"][key], forward= False)
<                     else:
<                         continue
<                 else:
<                     print(f"[rank {torch.distributed.get_rank()}] trying to send to prev, tensor name = {key}. send_info = {send_info['tensors']}")
<             for i in range(num_parents):
<                 send_prev_op = torch.distributed.P2POp(torch.distributed.isend, tensor_partitioned[i], prev_ranks[i])
<                 ops.append(send_prev_op)  
<                 if DEBUG_COMMUNICATE:
<                     print_communication_info(torch.distributed.get_rank(), f"send [{key} ({tensor_partitioned[i].dtype})] to ", prev_ranks[i], list(tensor_partitioned[i].size()))
<             if recv_prev:
<                 recv_info = mpu.get_recv_info(forward=True)
<                 for i in range(num_parents):
<                     recv_prev_op = torch.distributed.P2POp(torch.distributed.irecv, flatten_tensor_recv_prev[key][i], prev_ranks[i])
<                     ops.append(recv_prev_op)
<                     if DEBUG_COMMUNICATE:
<                         print_communication_info(torch.distributed.get_rank(), f"recv [{key}] from ", prev_ranks[i], list(flatten_tensor_recv_prev[key][i].size()))                
< 
---
>         ops = []
>         if tensor_send_prev is not None:
>             send_prev_op = torch.distributed.P2POp(
>                 torch.distributed.isend, tensor_send_prev,
>                 mpu.get_pipeline_model_parallel_prev_rank())
>             ops.append(send_prev_op)
>         if tensor_recv_prev is not None:
>             recv_prev_op = torch.distributed.P2POp(
>                 torch.distributed.irecv, tensor_recv_prev,
>                 mpu.get_pipeline_model_parallel_prev_rank())
>             ops.append(recv_prev_op)
>         if tensor_send_next is not None:
>             send_next_op = torch.distributed.P2POp(
>                 torch.distributed.isend, tensor_send_next,
>                 mpu.get_pipeline_model_parallel_next_rank())
>             ops.append(send_next_op)
>         if tensor_recv_next is not None:
>             recv_next_op = torch.distributed.P2POp(
>                 torch.distributed.irecv, tensor_recv_next,
>                 mpu.get_pipeline_model_parallel_next_rank())
>             ops.append(recv_next_op)
>         if len(ops) > 0:
250,316d110
<             # torch.cuda.synchronize()
<     elif recv_prev:
<         recv_info = mpu.get_recv_info(forward=True)
<         for key in sorted(recv_info["tensors"]): 
<             if recv_info["tensors"][key]["extra_tensor"] and not EXTRA_TENSOR_TRANSFER:
<                 continue
<             ops = []    
<             for i in range(num_parents):
<                 recv_prev_op = torch.distributed.P2POp(torch.distributed.irecv, flatten_tensor_recv_prev[key][i], prev_ranks[i])
<                 ops.append(recv_prev_op)
<                 if DEBUG_COMMUNICATE:
<                     print_communication_info(torch.distributed.get_rank(), f"recv [{key}] from ", prev_ranks[i], list(flatten_tensor_recv_prev[key][i].size()))
< 
<             reqs = torch.distributed.batch_isend_irecv(ops)
<             for req in reqs:
<                 req.wait()  
<             # torch.cuda.synchronize()        
< 
<     if tensor_send_next is not None:
<         send_info = mpu.get_send_info(forward=True)
<         for key in sorted(send_info["tensors"]):
<             ops = []
<             with torch.no_grad():
<                 if key in tensor_send_next:
<                     tensor_partitioned = _partition(tensor_send_next[key], send_info["tensors"][key], forward=True)
<                 elif key in extra_tensor_send_next:
<                     if EXTRA_TENSOR_TRANSFER:
<                         tensor_partitioned = _partition(extra_tensor_send_next[key], send_info["tensors"][key], forward=True) 
<                     else:
<                         continue
<             for i in range(num_childs):
<                 send_next_op = torch.distributed.P2POp(torch.distributed.isend, tensor_partitioned[i], next_ranks[i])
<                 ops.append(send_next_op)  
<                 if DEBUG_COMMUNICATE:
<                     print_communication_info(torch.distributed.get_rank(), f"send [{key}] to ", next_ranks[i], list(tensor_partitioned[i].size()))
<             if recv_next:
<                 recv_info = mpu.get_recv_info(forward=False)
<                 for i in range(num_childs):
<                     recv_next_op = torch.distributed.P2POp(torch.distributed.irecv, flatten_tensor_recv_next[key][i], next_ranks[i])
<                     ops.append(recv_next_op)
<                     if DEBUG_COMMUNICATE:
<                         print_communication_info(torch.distributed.get_rank(), f"recv [{key}] from ", next_ranks[i], list(flatten_tensor_recv_next[key][i].size()))                
< 
<             reqs = torch.distributed.batch_isend_irecv(ops)
<             for req in reqs:
<                 req.wait()
<             # torch.cuda.synchronize()
< 
<     elif recv_next:
<         recv_info = mpu.get_recv_info(forward=False)
<         for key in sorted(recv_info["tensors"]): 
<             if recv_info["tensors"][key]["extra_tensor"] and not EXTRA_TENSOR_TRANSFER:
<                 continue            
<             ops = []          
<             for i in range(num_childs):
<                 recv_next_op = torch.distributed.P2POp(torch.distributed.irecv, flatten_tensor_recv_next[key][i], next_ranks[i])
<                 ops.append(recv_next_op)
<                 if DEBUG_COMMUNICATE:
<                     print_communication_info(torch.distributed.get_rank(), f"recv [{key}] from ", next_ranks[i], list(flatten_tensor_recv_next[key][i].size()))  
< 
<             reqs = torch.distributed.batch_isend_irecv(ops)
<             for req in reqs:
<                 req.wait()  
<     # if len(ops) > 0:
<     #     reqs = torch.distributed.batch_isend_irecv(ops)
<     #     for req in reqs:
<     #         req.wait()
320c114,115
<     with torch.no_grad():
---
>     # If using scatter-gather optimization, gather smaller chunks.
>     if args.scatter_gather_tensors_in_pipeline:
322c117,119
<             tensor_recv_prev, extra_tensor_recv_prev = _reshape(flatten_tensor_recv_prev, recv_info, forward=True)
---
>             tensor_recv_prev = mpu.gather_split_1d_tensor(
>                 tensor_recv_prev).view(tensor_shape).requires_grad_()
> 
324c121,122
<             tensor_recv_next, extra_tensor_recv_next = _reshape(flatten_tensor_recv_next, recv_info, forward=False)
---
>             tensor_recv_next = mpu.gather_split_1d_tensor(
>                 tensor_recv_next).view(tensor_shape).requires_grad_()
326,335c124
<     if recv_prev:
<         for key in sorted(tensor_recv_prev):
<             tensor_recv_prev[key].requires_grad = True
<         for key in sorted(extra_tensor_recv_prev):
<             extra_tensor_recv_prev[key].requires_grad = True    
<     if recv_next:
<         for key in sorted(tensor_recv_next):
<             tensor_recv_next[key].requires_grad = True
<         for key in sorted(extra_tensor_recv_next):
<             extra_tensor_recv_next[key].requires_grad = True                    
---
>     return tensor_recv_prev, tensor_recv_next
337d125
<     return tensor_recv_prev, extra_tensor_recv_prev, tensor_recv_next, extra_tensor_recv_next
340,341c128
<     """Receive tensor from previous rank in pipeline (forward receive)."""  
< 
---
>     """Receive tensor from previous rank in pipeline (forward receive)."""
343,344c130
<         input_tensors = None
<         input_extra_tensors = None
---
>         input_tensor = None
348,354c134,137
< 
<         input_tensors, input_extra_tensors, _, _  = _communicate_flexpipe(
<             tensor_send_next=None, 
<             tensor_send_prev=None, 
<             extra_tensor_send_next=None, 
<             extra_tensor_send_prev=None, 
<             recv_prev=True, 
---
>         input_tensor, _ = _communicate(
>             tensor_send_next=None,
>             tensor_send_prev=None,
>             recv_prev=True,
356d138
<             
359,360c141
< 
<     return input_tensors, input_extra_tensors
---
>     return input_tensor
365d145
< 
368d147
<         output_extra_tensors_grad = None
372,378c151,154
< 
<         _, _, output_tensor_grad, output_extra_tensors_grad  = _communicate_flexpipe(
<             tensor_send_next=None, 
<             tensor_send_prev=None, 
<             extra_tensor_send_next=None, 
<             extra_tensor_send_prev=None, 
<             recv_prev=False, 
---
>         _, output_tensor_grad = _communicate(
>             tensor_send_next=None,
>             tensor_send_prev=None,
>             recv_prev=False,
380d155
< 
383,384c158
< 
<     return output_tensor_grad, output_extra_tensors_grad
---
>     return output_tensor_grad
387c161
< def send_forward(output_tensor, output_extra_tensors, timers=None):
---
> def send_forward(output_tensor, timers=None):
389d162
< 
393,399c166,169
< 
<         _communicate_flexpipe(
<             tensor_send_next=output_tensor, 
<             tensor_send_prev=None, 
<             extra_tensor_send_next=output_extra_tensors, 
<             extra_tensor_send_prev=None, 
<             recv_prev=False, 
---
>         _communicate(
>             tensor_send_next=output_tensor,
>             tensor_send_prev=None,
>             recv_prev=False,
401d170
< 
406c175
< def send_backward(input_tensor_grad, extra_tensors_grad, timers=None):
---
> def send_backward(input_tensor_grad, timers=None):
408d176
< 
412,418c180,183
< 
<         _communicate_flexpipe(
<             tensor_send_next=None, 
<             tensor_send_prev=input_tensor_grad, 
<             extra_tensor_send_next=None, 
<             extra_tensor_send_prev=extra_tensors_grad, 
<             recv_prev=False, 
---
>         _communicate(
>             tensor_send_next=None,
>             tensor_send_prev=input_tensor_grad,
>             recv_prev=False,
420d184
< 
425c189
< def send_forward_recv_backward(output_tensor, output_extra_tensors, timers=None):
---
> def send_forward_recv_backward(output_tensor, timers=None):
427d190
< 
430d192
<         output_extra_tensors_grad = None
434,440c196,199
< 
<         _, _, output_tensor_grad, output_extra_tensors_grad = _communicate_flexpipe(
<             tensor_send_next=output_tensor, 
<             tensor_send_prev=None, 
<             extra_tensor_send_next=output_extra_tensors, 
<             extra_tensor_send_prev=None, 
<             recv_prev=False, 
---
>         _, output_tensor_grad = _communicate(
>             tensor_send_next=output_tensor,
>             tensor_send_prev=None,
>             recv_prev=False,
442d200
< 
444a203
>     return output_tensor_grad
446d204
<     return output_tensor_grad, output_extra_tensors_grad
448,449c206
< 
< def send_backward_recv_forward(input_tensor_grad, extra_tensors_grad, timers=None):
---
> def send_backward_recv_forward(input_tensor_grad, timers=None):
451d207
< 
454d209
<         extra_tensors = None
458,464c213,216
< 
<         input_tensor, extra_tensors, _, _ = _communicate_flexpipe(
<             tensor_send_next=None, 
<             tensor_send_prev=input_tensor_grad, 
<             extra_tensor_send_next=None, 
<             extra_tensor_send_prev=extra_tensors_grad, 
<             recv_prev=True, 
---
>         input_tensor, _ = _communicate(
>             tensor_send_next=None,
>             tensor_send_prev=input_tensor_grad,
>             recv_prev=True,
466d217
< 
468a220
>     return input_tensor
470d221
<     return input_tensor, extra_tensors
472c223
< def send_forward_recv_forward(output_tensor, output_extra_tensors, recv_prev, timers=None):
---
> def send_forward_recv_forward(output_tensor, recv_prev, timers=None):
474d224
< 
477,483c227,230
< 
<     input_tensor, extra_tensors, _, _ = _communicate_flexpipe(
<         tensor_send_next=output_tensor, 
<         tensor_send_prev=None, 
<         extra_tensor_send_next=output_extra_tensors, 
<         extra_tensor_send_prev=None, 
<         recv_prev=recv_prev, 
---
>     input_tensor, _ = _communicate(
>         tensor_send_next=output_tensor,
>         tensor_send_prev=None,
>         recv_prev=recv_prev,
485d231
< 
487a234
>     return input_tensor
489d235
<     return input_tensor, extra_tensors
491,492c237
< 
< def send_backward_recv_backward(input_tensor_grad, extra_tensors_grad, recv_next, timers=None):
---
> def send_backward_recv_backward(input_tensor_grad, recv_next, timers=None):
494d238
< 
497,503c241,244
< 
<     _, _, output_tensor_grad, output_extra_tensors_grad = _communicate_flexpipe(
<         tensor_send_next=None, 
<         tensor_send_prev=input_tensor_grad, 
<         extra_tensor_send_next=None, 
<         extra_tensor_send_prev=extra_tensors_grad, 
<         recv_prev=False, 
---
>     _, output_tensor_grad = _communicate(
>         tensor_send_next=None,
>         tensor_send_prev=input_tensor_grad,
>         recv_prev=False,
505d245
< 
508,509c248
< 
<     return output_tensor_grad, output_extra_tensors_grad
---
>     return output_tensor_grad
513,516c252,254
<         output_tensor, output_extra_tensors, input_tensor_grad, extra_tensors_grad, 
<         recv_prev, recv_next, timers=None):
<     """Batched send and recv with previous and next ranks in pipeline."""  
< 
---
>         output_tensor, input_tensor_grad, recv_prev,
>         recv_next, timers=None):
>     """Batched send and recv with previous and next ranks in pipeline."""
519,525c257,260
< 
<     input_tensor, extra_tensors, output_tensor_grad, output_extra_tensors_grad = _communicate_flexpipe(
<         tensor_send_next=output_tensor, 
<         tensor_send_prev=input_tensor_grad, 
<         extra_tensor_send_next=output_extra_tensors, 
<         extra_tensor_send_prev=extra_tensors_grad, 
<         recv_prev=recv_prev, 
---
>     input_tensor, output_tensor_grad = _communicate(
>         tensor_send_next=output_tensor,
>         tensor_send_prev=input_tensor_grad,
>         recv_prev=recv_prev,
527d261
< 
530,531c264
< 
<     return input_tensor, extra_tensors, output_tensor_grad, output_extra_tensors_grad
---
>     return input_tensor, output_tensor_grad
diff --color -r runtime/megatron/schedules.py ../Megatron-LM-base/megatron/schedules.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/schedules.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
27d18
< from torch.autograd.variable import Variable
49c40,41
< def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced, extra_tensors = None):
---
> 
> def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
56d47
<  
57a49
> 
62,64c54
< 
<     output_tensor, output_extra_tensors, loss_func = forward_step_func(data_iterator, model, extra_tensors)
< 
---
>     output_tensor, loss_func = forward_step_func(data_iterator, model)
72,74c62
<     return output_tensor, output_extra_tensors
< 
< def update_output_extra_tensors_grad(output_extra_tensors, output_extra_tensors_grad):
---
>     return output_tensor
76,78d63
<     if output_extra_tensors_grad is not None and output_extra_tensors is not None:
<         for key in sorted(output_extra_tensors):
<             output_extra_tensors[key].grad = output_extra_tensors_grad[key]
80,157c65
< def retain_input_tensors_grad_and_check_output_grad(input_tensor, extra_tensors, output_tensor, output_tensor_grad):
< 
<     if input_tensor is not None:
<         for key in sorted(input_tensor):
<             input_tensor[key].retain_grad()
< 
<     if extra_tensors is not None:
<         for key in sorted(extra_tensors):
<             if extra_tensors[key].requires_grad:
<                 extra_tensors[key].retain_grad()
< 
<     output_tensor_list = None
<     output_tensor_grad_list = None
< 
<     if output_tensor is not None:
<         if isinstance(output_tensor, dict):
<             output_tensor_list = []
<             for key in sorted(output_tensor):
<                 output_tensor_list.append(output_tensor[key])
<         else:
<             output_tensor_list = output_tensor
< 
<     if output_tensor_grad is not None:
<         if isinstance(output_tensor_grad, dict):
<             output_tensor_grad_list = []
<             for key in sorted(output_tensor_grad):
<                 output_tensor_grad_list.append(output_tensor_grad[key])
<         else:
<             output_tensor_grad_list = output_tensor_grad
< 
<     return output_tensor_list, output_tensor_grad_list
< 
< def collect_grad_of_input_and_extra_tensors(input_tensor, extra_tensors):
<     input_tensor_grad = None
<     extra_tensors_grad = None
<     if input_tensor is not None:
<         input_tensor_grad = {}
<         for key in sorted(input_tensor):
<             if input_tensor[key].grad is None:
<                 input_tensor_grad[key] = torch.zeros(list(input_tensor[key].size()), requires_grad=False, device=torch.cuda.current_device(), dtype=torch.float16)
<             else:
<                 input_tensor_grad[key] = input_tensor[key].grad
< 
<     # When we want to send back the gradients of some extra tensors (encoder_output),
<     # its gradients may not be calculated yet, current workaround: send back zero values.
<     if extra_tensors is not None:
<         extra_tensors_grad = {}
<         for key in sorted(extra_tensors):
<             if extra_tensors[key].grad is None:
<                 extra_tensors_grad[key] = torch.zeros(list(extra_tensors[key].size()), requires_grad=False, device=torch.cuda.current_device(), dtype=torch.float16)
<             else:
<                 extra_tensors_grad[key] = extra_tensors[key].grad
< 
<     return input_tensor_grad, extra_tensors_grad
< 
< def deallocate_output_tensor(out_dict):
<     '''Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
< 
<     This method should be called right after the output tensor has been
<     sent to the next pipeline stage. At this point, the output tensor is
<     only useful for its '.grad_fn' field, and not its '.data'.
<     '''
<     if out_dict is None or isinstance(out_dict, torch.Tensor):
<         return
<     assert isinstance(out_dict, dict), f"dict of tensors is required. instead of: {out_dict}"
<     for name in out_dict:
<         assert isinstance(out_dict[name], torch.Tensor), \
<             "expected Tensor, found %s." % type(out_dict[name]).__name__
<         assert out_dict[name]._base is None, \
<             f"counter-productive to free a view of another tensor. rank {torch.distributed.get_rank()}, tensor: {name}"
<         out_dict[name].data = torch.empty(
<             (1,),
<             device = out_dict[name].device,
<             dtype = out_dict[name].dtype,
<     )
<         
< 
< def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad, extra_tensors=None, output_extra_tensors=None, output_extra_tensors_grad=None, model=None):
---
> def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
165d72
< 
167d73
<     timers = get_timers()
168a75
>     timers = get_timers()
171,172c78,80
<     update_output_extra_tensors_grad(output_extra_tensors, output_extra_tensors_grad)
<     output_tensor, output_tensor_grad = retain_input_tensors_grad_and_check_output_grad(input_tensor, extra_tensors, output_tensor, output_tensor_grad)
---
>     # Retain the grad on the input_tensor.
>     if input_tensor is not None:
>         input_tensor.retain_grad()
175c83
<     if output_tensor_grad is None:       
---
>     if output_tensor_grad is None:
177d84
<     
182,184c89,90
<     extra_tensors_grad = None
< 
<     input_tensor_grad, extra_tensors_grad = collect_grad_of_input_and_extra_tensors(input_tensor, extra_tensors)
---
>     if input_tensor is not None:
>         input_tensor_grad = input_tensor.grad
188c94,95
<     return input_tensor_grad, extra_tensors_grad
---
>     return input_tensor_grad
> 
196a104
> 
214c122
<             output_tensor, _= forward_step(forward_step_func, data_iterator, model,
---
>             output_tensor = forward_step(forward_step_func, data_iterator, model,
216d123
< 
219c126
<                               output_tensor_grad, model=model)
---
>                               output_tensor_grad)
223c130
<     output_tensor, _ = forward_step(forward_step_func, data_iterator, model,
---
>     output_tensor = forward_step(forward_step_func, data_iterator, model,
225d131
< 
227c133
<         backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad, model=model)
---
>         backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad)
241,244d146
< 
<     input_extra_tensors_list = [[] for _ in range(len(model))]
<     output_extra_tensors_list = [[] for _ in range(len(model))] 
< 
247d148
<         output_extra_tensor_grads_list = [[] for _ in range(len(model))]
278,285d178
<     def list_append_helper(tensor_list, model_chunk_id, new_tensor, name=None):
<         tensor_list[model_chunk_id].append(new_tensor)      
<     
<     def list_pop_helper(tensor_list, model_chunk_id, name=None):
<         popped_tensor = tensor_list[model_chunk_id].pop(0)
< 
<         return popped_tensor
< 
304,305c197
<                 list_append_helper(input_tensors, model_chunk_id, None, "input_tensor")
<                 list_append_helper(input_extra_tensors_list, model_chunk_id, None, "input_extra_tensors")
---
>                 input_tensors[model_chunk_id].append(None)
307,309c199
<         extra_tensors = input_extra_tensors_list[model_chunk_id][-1]
< 
<         output_tensor, output_extra_tensors = forward_step(forward_step_func,
---
>         output_tensor = forward_step(forward_step_func,
312,315c202,203
<                                      input_tensor, losses_reduced, extra_tensors)
< 
<         list_append_helper(output_tensors, model_chunk_id, output_tensor, "output_tensor")
<         list_append_helper(output_extra_tensors_list, model_chunk_id, output_extra_tensors, "output_extra_tensors")
---
>                                      input_tensor, losses_reduced)
>         output_tensors[model_chunk_id].append(output_tensor)
317c205
<         return output_tensor, output_extra_tensors
---
>         return output_tensor
325d212
<         mpu.set_virtual_pipeline_backward_model_parallel_rank(model_chunk_id)
329,341c216,221
<                 list_append_helper(output_tensor_grads, model_chunk_id, None, "output_tensor_grads")
<                 list_append_helper(output_extra_tensor_grads_list, model_chunk_id, None, "output_extra_tensor_grads")
< 
<         input_tensor = list_pop_helper(input_tensors, model_chunk_id, "input_tensor")
<         output_tensor = list_pop_helper(output_tensors, model_chunk_id, "output_tensor")
<         output_tensor_grad = list_pop_helper(output_tensor_grads, model_chunk_id, "output_tensor_grad")   
< 
<         extra_tensors = list_pop_helper(input_extra_tensors_list, model_chunk_id, "input_extra_tensors")
<         output_extra_tensors = list_pop_helper(output_extra_tensors_list, model_chunk_id, "output_extra_tensors")
<         output_extra_tensors_grad = list_pop_helper(output_extra_tensor_grads_list, model_chunk_id, "output_extra_tensors_grad")
< 
<         input_tensor_grad, extra_tensors_grad = \
<             backward_step(optimizer,  
---
>                 output_tensor_grads[model_chunk_id].append(None)
>         input_tensor = input_tensors[model_chunk_id].pop(0)
>         output_tensor = output_tensors[model_chunk_id].pop(0)
>         output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0)
>         input_tensor_grad = \
>             backward_step(optimizer,
344,347c224
<                           output_tensor_grad,
<                           extra_tensors,
<                           output_extra_tensors,
<                           output_extra_tensors_grad)
---
>                           output_tensor_grad)
349c226
<         return input_tensor_grad, extra_tensors_grad
---
>         return input_tensor_grad
353,360c230,231
<     mpu.set_virtual_pipeline_backward_model_parallel_rank(0)
<     mpu.set_virtual_pipeline_next_forward_model_rank(0)
<     mpu.set_virtual_pipeline_next_backward_model_rank(0)
<     input_tensor, extra_tensors = p2p_communication.recv_forward(timers)
< 
<     list_append_helper(input_tensors, 0, input_tensor, "input_tensor")
<     list_append_helper(input_extra_tensors_list, 0, extra_tensors, "input_extra_tensors")
< 
---
>     input_tensors[0].append(
>         p2p_communication.recv_forward(timers))
362,364c233
<         output_tensor, output_extra_tensors = forward_step_helper(k)
<         
<         current_forward_model_chunk_id = get_model_chunk_id(k, forward=True)
---
>         output_tensor = forward_step_helper(k)
368d236
< 
379d246
<             output_extra_tensors = None # number of extra tensors
383c250,251
<         if k == (num_warmup_microbatches - 1) and not forward_only and not all_warmup_microbatches:
---
>         if k == (num_warmup_microbatches - 1) and not forward_only and \
>                 not all_warmup_microbatches:
385,387d252
<             extra_tensors_grad = None
<             output_tensor_grad = None
<             output_extra_tensors_grad = None
391,396c256
< 
<             mpu.set_virtual_pipeline_model_parallel_rank(current_forward_model_chunk_id)
<             mpu.set_virtual_pipeline_next_forward_model_rank(next_forward_model_chunk_id)
<             mpu.set_virtual_pipeline_next_backward_model_rank(num_model_chunks-1)
< 
<             input_tensor, extra_tensors, output_tensor_grad, output_extra_tensors_grad = \
---
>             input_tensor, output_tensor_grad = \
398,403c258,261
<                         output_tensor, output_extra_tensors, input_tensor_grad, extra_tensors_grad,
<                         recv_prev=recv_prev, recv_next=recv_next, timers=timers)
< 
<             list_append_helper(output_tensor_grads, num_model_chunks-1, output_tensor_grad, "output_tensor_grads")
<             list_append_helper(output_extra_tensor_grads_list, num_model_chunks-1, output_extra_tensors_grad, "output_extra_tensor_grads")
< 
---
>                         output_tensor, input_tensor_grad,
>                         recv_prev=recv_prev, recv_next=recv_next,
>                         timers=timers)
>             output_tensor_grads[num_model_chunks-1].append(output_tensor_grad)
405,408c263
<             mpu.set_virtual_pipeline_model_parallel_rank(current_forward_model_chunk_id)
<             mpu.set_virtual_pipeline_next_forward_model_rank(next_forward_model_chunk_id)
< 
<             input_tensor, extra_tensors = \
---
>             input_tensor = \
410,413c265,266
<                     output_tensor, output_extra_tensors, recv_prev, timers)
< 
<         list_append_helper(input_tensors, next_forward_model_chunk_id, input_tensor, "input_tensor")
<         list_append_helper(input_extra_tensors_list, next_forward_model_chunk_id, extra_tensors, "input_extra_tensors")
---
>                     output_tensor, recv_prev, timers)
>         input_tensors[next_forward_model_chunk_id].append(input_tensor)
419c272
<         output_tensor, output_extra_tensors = forward_step_helper(forward_k)
---
>         output_tensor = forward_step_helper(forward_k)
423c276
<         input_tensor_grad, extra_tensors_grad = backward_step_helper(backward_k)
---
>         input_tensor_grad = backward_step_helper(backward_k)
434d286
<             output_extra_tensors = None
440d291
<             extra_tensors_grad = None
473,477d323
<         mpu.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id)
<         mpu.set_virtual_pipeline_backward_model_parallel_rank(backward_model_chunk_id)
<         mpu.set_virtual_pipeline_next_forward_model_rank(next_forward_model_chunk_id)
<         mpu.set_virtual_pipeline_next_backward_model_rank(next_backward_model_chunk_id)
< 
479c325
<         input_tensor, extra_tensors, output_tensor_grad, output_extra_tensors_grad= \
---
>         input_tensor, output_tensor_grad = \
481,482c327,329
<                     output_tensor, output_extra_tensors, input_tensor_grad, extra_tensors_grad,
<                     recv_prev=recv_prev, recv_next=recv_next, timers=timers)
---
>                     output_tensor, input_tensor_grad,
>                     recv_prev=recv_prev, recv_next=recv_next,
>                     timers=timers)
487,488c334
<             list_append_helper(input_tensors, next_forward_model_chunk_id, input_tensor, "input_tensor")
<             list_append_helper(input_extra_tensors_list, next_forward_model_chunk_id, extra_tensors, "input_extra_tensors")          
---
>             input_tensors[next_forward_model_chunk_id].append(input_tensor)
490,491c336,337
<             list_append_helper(output_tensor_grads, next_backward_model_chunk_id, output_tensor_grad, "output_tensor_grads")
<             list_append_helper(output_extra_tensor_grads_list, next_backward_model_chunk_id, output_extra_tensors_grad, "output_extra_tensor_grads")
---
>             output_tensor_grads[next_backward_model_chunk_id].append(
>                 output_tensor_grad)
496,499c342,343
<             output_tensor_grad, output_extra_tensors_grad = p2p_communication.recv_backward(timers)
<             list_append_helper(output_tensor_grads, num_model_chunks-1, output_tensor_grad, "output_tensor_grads")
<             list_append_helper(output_extra_tensor_grads_list, num_model_chunks-1, output_extra_tensors_grad, "output_extra_tensor_grads")
< 
---
>             output_tensor_grads[num_model_chunks-1].append(
>                 p2p_communication.recv_backward(timers))
501,502c345
<             input_tensor_grad, extra_tensors_grad = backward_step_helper(k)
<             backward_model_chunk_id = get_model_chunk_id(k, forward=False)
---
>             input_tensor_grad = backward_step_helper(k)
510,514c353
< 
<             mpu.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id)
<             mpu.set_virtual_pipeline_backward_model_parallel_rank(backward_model_chunk_id)
<             mpu.set_virtual_pipeline_next_backward_model_rank(next_backward_model_chunk_id)        
<             output_tensor_grad, output_extra_tensors_grad = \
---
>             output_tensor_grads[next_backward_model_chunk_id].append(
516,520c355
<                     input_tensor_grad, extra_tensors_grad, recv_next, timers)
< 
<             if recv_next:
<                 list_append_helper(output_tensor_grads, next_backward_model_chunk_id, output_tensor_grad, "output_tensor_grads")
<                 list_append_helper(output_extra_tensor_grads_list, next_backward_model_chunk_id, output_extra_tensors_grad, "output_extra_tensor_grads")              
---
>                     input_tensor_grad, recv_next, timers))
524,531d358
< def delete_tensors(tensor_dict):
<     if tensor_dict is not None:
<         saved_keys = []
<         for key in tensor_dict:
<             saved_keys.append(key)
<         for key in saved_keys:
<             del tensor_dict[key]
<         del tensor_dict
560,562d386
<     input_extra_tensors_list = []
<     output_extra_tensors_list = []
< 
564,568c388,392
<     for i in range(num_warmup_microbatches):  
<         input_tensor, extra_tensors = p2p_communication.recv_forward(timers)
< 
<         output_tensor, output_extra_tensors = forward_step(forward_step_func, data_iterator, model,
<                                      input_tensor, losses_reduced, extra_tensors)
---
>     for i in range(num_warmup_microbatches):
>         input_tensor = p2p_communication.recv_forward(timers)
>         output_tensor = forward_step(forward_step_func, data_iterator, model,
>                                      input_tensor, losses_reduced)
>         p2p_communication.send_forward(output_tensor, timers)
570,576c394,395
<         p2p_communication.send_forward(output_tensor, output_extra_tensors, timers)
< 
<         if not forward_only: 
<             input_tensors.append(input_tensor)
<             output_tensors.append(output_tensor)
<             input_extra_tensors_list.append(extra_tensors)
<             output_extra_tensors_list.append(output_extra_tensors)
---
>         input_tensors.append(input_tensor)
>         output_tensors.append(output_tensor)
581,582c400,401
<     if num_microbatches_remaining > 0:       
<         input_tensor, extra_tensors = p2p_communication.recv_forward(timers)
---
>     if num_microbatches_remaining > 0:
>         input_tensor = p2p_communication.recv_forward(timers)
585c404
<     for i in range(num_microbatches_remaining):      
---
>     for i in range(num_microbatches_remaining):
588,590c407,408
<         output_tensor, output_extra_tensors = forward_step(forward_step_func, data_iterator, model,
<                                      input_tensor, losses_reduced, extra_tensors)
< 
---
>         output_tensor = forward_step(forward_step_func, data_iterator, model,
>                                      input_tensor, losses_reduced)
592,595c410,414
<             p2p_communication.send_forward(output_tensor, output_extra_tensors, timers)
<         else:        
<             output_tensor_grad, output_extra_tensors_grad = \
<                 p2p_communication.send_forward_recv_backward(output_tensor, output_extra_tensors, timers)
---
>             p2p_communication.send_forward(output_tensor, timers)
>         else:
>             output_tensor_grad = \
>                 p2p_communication.send_forward_recv_backward(output_tensor,
>                                                              timers)
599,603c418,419
<         if not forward_only: 
<             input_tensors.append(input_tensor)
<             output_tensors.append(output_tensor)
<             input_extra_tensors_list.append(extra_tensors)
<             output_extra_tensors_list.append(output_extra_tensors)     
---
>         input_tensors.append(input_tensor)
>         output_tensors.append(output_tensor)
607c423
<                 input_tensor, extra_tensors = p2p_communication.recv_forward(timers)
---
>                 input_tensor = p2p_communication.recv_forward(timers)
610,614d425
<             extra_tensors, output_extra_tensors = input_extra_tensors_list.pop(0), output_extra_tensors_list.pop(0)
< 
<             input_tensor_grad, extra_tensors_grad = \
<                 backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad, extra_tensors, output_extra_tensors, 
<                               output_extra_tensors_grad, model)
616,617c427,429
<             delete_tensors(output_extra_tensors)
<             delete_tensors(output_extra_tensors_grad)
---
>             input_tensor_grad = \
>                 backward_step(optimizer, input_tensor, output_tensor,
>                               output_tensor_grad)
621,623c433,435
<                 p2p_communication.send_backward(input_tensor_grad, extra_tensors_grad, timers)
<             else:            
<                 input_tensor, extra_tensors = \
---
>                 p2p_communication.send_backward(input_tensor_grad, timers)
>             else:
>                 input_tensor = \
625c437
<                         input_tensor_grad, extra_tensors_grad, timers)
---
>                         input_tensor_grad, timers)
629c441
<         for i in range(num_warmup_microbatches):        
---
>         for i in range(num_warmup_microbatches):
632,633d443
<             extra_tensors = input_extra_tensors_list.pop(0)
<             output_extra_tensors = output_extra_tensors_list.pop(0)
635c445
<             output_tensor_grad, output_extra_tensors_grad = p2p_communication.recv_backward(timers)
---
>             output_tensor_grad = p2p_communication.recv_backward(timers)
637,642c447,449
<             input_tensor_grad, extra_tensors_grad = \
<                 backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad, extra_tensors, output_extra_tensors, 
<                               output_extra_tensors_grad, model)
< 
<             delete_tensors(output_extra_tensors)
<             delete_tensors(output_extra_tensors_grad)            
---
>             input_tensor_grad = \
>                 backward_step(optimizer, input_tensor, output_tensor,
>                               output_tensor_grad)
644c451
<             p2p_communication.send_backward(input_tensor_grad, extra_tensors_grad, timers)
---
>             p2p_communication.send_backward(input_tensor_grad, timers)
diff --color -r runtime/megatron/tokenizer/tokenizer.py ../Megatron-LM-base/megatron/tokenizer/tokenizer.py
23a24
> 
59,64c60,61
<     ## Modified for Aceso
<     max_mp_size = 0
<     for i in range(args.num_stages):
<         max_mp_size = max(max_mp_size, max(args.model_parallel_size_of_each_op[i]))
<     multiple = args.make_vocab_size_divisible_by * max_mp_size
< 
---
>     multiple = args.make_vocab_size_divisible_by * \
>         args.tensor_model_parallel_size
Only in ../Megatron-LM-base/megatron: training
diff --color -r runtime/megatron/training.py ../Megatron-LM-base/megatron/training.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/training.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
32c24
< # print(f"[DEBUG_TIME] at the beginning: _TRAIN_START_TIME = {_TRAIN_START_TIME}")
---
> 
34d25
< import numpy as np
63,64d53
< from megatron.mpu.utils import ensure_divisibility
< from torch.profiler import profile, record_function, ProfilerActivity
66,73c55
< import csv
< import os
< import gc
< 
< DEBUG_GRAD = os.environ.get("DEBUG_GRAD", '0') == '1'
< DEBUG_FIX_WEIGHT = os.environ.get("DEBUG_FIX_WEIGHT", '0') == '1'
< DEBUG_COMMUNICATE = os.environ.get("DEBUG_COMMUNICATE", '0') == '1'
< ENABLE_WEIGHT_SHARE = os.environ.get("ENABLE_WEIGHT_SHARE", '1') == '1'
---
> 
77d58
<     print(f"[rank {torch.distributed.get_rank()}]")
79d59
<     print(f"[rank {torch.distributed.get_rank()}] sync")
82a63
> 
120c101
<     start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME])
---
>     start_time_tensor = torch.cuda.FloatTensor([_TRAIN_START_TIME])
124c105
<     print_rank_0('[TIME] time to initialize megatron (seconds): {:.3f}'.format(
---
>     print_rank_0('time to initialize megatron (seconds): {:.3f}'.format(
125a107
>     print_datetime('after megatron is initialized')
128a111
> 
133,135c116,117
< 
<     print_rank_0('[TIME] after model, optimizer, and learning rate scheduler are built (seconds): {:.3f}'.format(
<         time.time() - _TRAIN_START_TIME))
---
>     print_datetime('after model, optimizer, and learning rate '
>                    'scheduler are built')
139c121
<     if args.virtual_pipeline_model_parallel_size is not None and args.virtual_pipeline_model_parallel_size > 1:
---
>     if args.virtual_pipeline_model_parallel_size is not None:
152,154c134
< 
<     print_rank_0('[TIME] after dataloaders are built (seconds): {:.3f}'.format(
<         time.time() - _TRAIN_START_TIME))    
---
>     print_datetime('after dataloaders are built')
165a146
>     print_datetime('after training is done')
167,184c148,162
<     print_rank_0('[TIME] after training is done (seconds): {:.3f}'.format(
<         time.time() - _TRAIN_START_TIME))                              
< 
<     # if args.do_valid:
<     #     prefix = 'the end of training for val data'
<     #     evaluate_and_print_results(prefix, forward_step_func,
<     #                                valid_data_iterator, model,
<     #                                iteration, False)
< 
<     # if args.save and iteration != 0:
<     #     save_checkpoint(iteration, model, optimizer, lr_scheduler)
< 
<     # if args.do_test:
<     #     # Run on test data.
<     #     prefix = 'the end of training for test data'
<     #     evaluate_and_print_results(prefix, forward_step_func,
<     #                                test_data_iterator, model,
<     #                                0, True)
---
>     if args.do_valid:
>         prefix = 'the end of training for val data'
>         evaluate_and_print_results(prefix, forward_step_func,
>                                    valid_data_iterator, model,
>                                    iteration, False)
> 
>     if args.save and iteration != 0:
>         save_checkpoint(iteration, model, optimizer, lr_scheduler)
> 
>     if args.do_test:
>         # Run on test data.
>         prefix = 'the end of training for test data'
>         evaluate_and_print_results(prefix, forward_step_func,
>                                    test_data_iterator, model,
>                                    0, True)
215,496d192
< def set_weight(model, val=0.01):
<     for param in model.parameters():
<         param.data.fill_(val)
< 
< def get_op_via_index(op_index, models):
<     for model in models:
<         model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module)) 
<         for op in model.language_model.ops:
<             if op.op_index == op_index:
<                 return op
<     return None
< 
< def send_shared_tensors(op, models, grads=False):
<     
<     args = get_args()
<     shared_tensor = op.get_shared_tensor(grads=grads)
< 
<     for key in sorted(shared_tensor):
<         for op_index in op.shared_weights_info[key]["sharing_with_ops"]:
<             if not op.shared_weights_info[key]["sharing_weights_in_same_pipeline_rank"][op_index]:
<                 recv_ranks = op.shared_weights_info[key]["sharing_weights_with_ranks"][op_index]
<                 if len(recv_ranks) > 0:
<                     send_ops = []
<                     split_dim = op.shared_weights_info[key]["tp_split_dim"]
< 
<                     for recv_tp_groups in recv_ranks:
<                         if len(recv_tp_groups) == 1:
<                             tensor_list = [shared_tensor[key]]
<                         else:
<                             if split_dim != -1:
<                                 tensor_list = list(torch.chunk(shared_tensor[key], chunks=len(recv_tp_groups), dim=split_dim)) 
<                             else:
<                                 tensor_list = []
<                                 for _ in range(len(recv_tp_groups)):
<                                     tensor_list.append(shared_tensor[key])
< 
<                         for i in range(len(tensor_list)):
<                             send_op = torch.distributed.P2POp(
<                                 torch.distributed.isend, tensor_list[i].contiguous(), recv_tp_groups[i])
<                             send_ops.append(send_op) 
< 
<                             if DEBUG_COMMUNICATE:
<                                 current_rank = torch.distributed.get_rank()
<                                 string = f"(shared) rank {current_rank} send to {recv_tp_groups[i]} size = {list(tensor_list[i].size())}"
<                                 with open(f"{args.log_path}{args.log_name}_debug_communicate_rank{current_rank}.log", "a+") as f:
<                                     f.write(string+"\n")    
< 
<                     if len(send_ops) > 0:
<                         reqs = torch.distributed.batch_isend_irecv(send_ops)
<                         for req in reqs:
<                             req.wait()
<                         torch.cuda.synchronize()
< 
< def recv_shared_tensors(op, models, grads=False):
<     args = get_args()
< 
<     recv_dict = {}
<     shared_tensor = op.get_shared_tensor(grads=False)
<     for key in sorted(shared_tensor):
<         recv_dict[key] = []
< 
<     for key in sorted(shared_tensor):
<         if key == "position_embeddings" and not grads:
<             dtype = torch.float32
<         else:
<             dtype = args.params_dtype        
<         for op_index in op.shared_weights_info[key]["sharing_with_ops"]:
<             if op.shared_weights_info[key]["sharing_weights_in_same_pipeline_rank"][op_index]:
<                 src_op = get_op_via_index(op_index, models)
<                 recv_tensor = src_op.get_shared_tensor(grads=grads)
<                 recv_dict[key].append(recv_tensor[key])
<             else:
<                 send_ranks = op.shared_weights_info[key]["sharing_weights_with_ranks"][op_index]
<                 if len(send_ranks) > 0: 
<                     recv_ops = []
<                     tensor_list = []
<                     receive_size = list(shared_tensor[key].size())
<                     split_dim = op.shared_weights_info[key]["tp_split_dim"]
<                     if split_dim != -1:
<                         receive_size[split_dim] //= len(send_ranks[0])
<                         
<                     for send_tp_groups in send_ranks:
<                         tmp_tensor_list = []
<                         for _ in range(len(send_tp_groups)):
<                             tmp_tensor_list.append(torch.empty(receive_size, requires_grad=True, device=torch.cuda.current_device(), dtype=dtype))
<                         for i in range(len(tmp_tensor_list)):
<                             recv_op = torch.distributed.P2POp(
<                                 torch.distributed.irecv, tmp_tensor_list[i], send_tp_groups[i])
<                             recv_ops.append(recv_op)
<                             if DEBUG_COMMUNICATE:
<                                 current_rank = torch.distributed.get_rank()
<                                 string = f"(shared) rank {current_rank} recv from {send_tp_groups[i]} size = {list(tmp_tensor_list[i].size())}"
<                                 with open(f"{args.log_path}{args.log_name}_debug_communicate_rank{current_rank}.log", "a+") as f:
<                                     f.write(string+"\n")    
<                         tensor_list.append(tmp_tensor_list)
< 
<                     if len(recv_ops) > 0:
<                         reqs = torch.distributed.batch_isend_irecv(recv_ops)
<                         for req in reqs:
<                             req.wait()
<                         torch.cuda.synchronize()
< 
<                     if split_dim != -1:
<                         if len(tensor_list) == 1:
<                             recv_dict[key].append(torch.cat(tensor_list[0], dim=split_dim))
<                         else:
<                             result_tensor = torch.sum(torch.stack([torch.cat(tensor_list[i], dim=split_dim) for i in range(len(tensor_list))]), dim=0)
<                             recv_dict[key].append(result_tensor)
<                     else:
<                         if len(tensor_list) == 1:
<                             recv_dict[key].append(tensor_list[0][0])
<                         else:
<                             result_tensor = torch.sum(torch.stack([tensor_list[i][0] for i in range(len(tensor_list))]), dim=0)
<                             recv_dict[key].append(result_tensor)
<     return recv_dict
< 
< def initialize_weights_sharing(models):
<     if ENABLE_WEIGHT_SHARE:
<         pipeline_rank = mpu.get_pipeline_model_parallel_rank()
<         virtual_pipeline_rank = mpu.get_virtual_pipeline_model_parallel_rank()    
<         rank = torch.distributed.get_rank()
<         # initialize the ranks
<         for model in models:
<             model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module)) 
<             for op in model.language_model.ops:
<                 if len(op.shared_weights_info) > 0:
<                     for key in sorted(op.shared_weights_info):
<                         op.shared_weights_info[key]["sharing_weights_in_same_pipeline_rank"] = {}   
<                         op.shared_weights_info[key]["sharing_weights_with_ranks"] = {}                      
<                         if op.shared_weights_info[key]["root"]:
<                             # calculate & store the destination ranks. 
<                             for op_index in op.shared_weights_info[key]["sharing_with_ops"]:
<                                 dest_pipeline_rank = mpu.get_pipeline_rank_via_op_index(op_index)
<                                 if dest_pipeline_rank == pipeline_rank:
<                                     op.shared_weights_info[key]["sharing_weights_in_same_pipeline_rank"][op_index] = True
<                                 else:
<                                     op.shared_weights_info[key]["sharing_weights_in_same_pipeline_rank"][op_index] = False
< 
<                                     ranks_in_send_stage = mpu.get_ranks_via_pipeline_stage(pipeline_rank)
<                                     ranks_in_receive_stage = mpu.get_ranks_via_pipeline_stage(dest_pipeline_rank)
<                                     num_ranks_in_send_stage = len(ranks_in_send_stage)
<                                     num_ranks_in_receive_stage = len(ranks_in_receive_stage)
< 
<                                     tp_size, dp_size = mpu.get_op_tp_size(op.op_index), mpu.get_op_dp_size(op.op_index)
<                                     tp_size_next, dp_size_next = mpu.get_op_tp_size(op_index), mpu.get_op_dp_size(op_index)
< 
<                                     for i in range(num_ranks_in_send_stage):
<                                         if ranks_in_send_stage[i] == rank:
<                                             dp_id = i // tp_size
<                                             tp_id = i % tp_size
< 
<                                     next_dp_id = [dp_id]
<                                     next_tp_id = [tp_id]
< 
<                                     if tp_size_next > tp_size:
<                                         ratio = tp_size_next // tp_size
<                                         next_tp_id = range(tp_id * ratio, (tp_id + 1)*ratio)                                    
<                                     if tp_size_next < tp_size:
<                                         ratio = tp_size // tp_size_next
<                                         next_tp_id = [tp_id // ratio]  
<                                     if dp_size_next > dp_size:
<                                         ratio = dp_size_next // dp_size
<                                         next_dp_id = range(dp_id * ratio, (dp_id + 1)*ratio)                                      
<                                     if dp_size_next < dp_size:
<                                         ratio = dp_size // dp_size_next
<                                         if dp_id % ratio == 0:
<                                             next_dp_id = [dp_id // ratio] 
<                                         else:
<                                             next_dp_id = []
< 
<                                     op.shared_weights_info[key]["sharing_weights_with_ranks"][op_index] = []
<                                     if len(next_dp_id) > 0:
<                                         for _dp_id in next_dp_id:
<                                             tmp_list = []
<                                             for _tp_id in next_tp_id:
<                                                 tmp_list.append(ranks_in_receive_stage[_dp_id * tp_size_next + _tp_id])
<                                             op.shared_weights_info[key]["sharing_weights_with_ranks"][op_index].append(list(tmp_list))
<                         else:
<                             assert len(op.shared_weights_info[key]["sharing_with_ops"]) == 1
<                             op_index = op.shared_weights_info[key]["sharing_with_ops"][0]
<                             src_pipeline_rank = mpu.get_pipeline_rank_via_op_index(op_index)
<                             if src_pipeline_rank == pipeline_rank:
<                                 op.shared_weights_info[key]["sharing_weights_in_same_pipeline_rank"][op_index] = True
<                             else:
<                                 op.shared_weights_info[key]["sharing_weights_in_same_pipeline_rank"][op_index] = False
< 
<                                 ranks_in_send_stage = mpu.get_ranks_via_pipeline_stage(src_pipeline_rank)
<                                 ranks_in_receive_stage = mpu.get_ranks_via_pipeline_stage(pipeline_rank)
<                                 num_ranks_in_send_stage = len(ranks_in_send_stage)
<                                 num_ranks_in_receive_stage = len(ranks_in_receive_stage)
< 
<                                 tp_size, dp_size = mpu.get_op_tp_size(op.op_index), mpu.get_op_dp_size(op.op_index)
<                                 tp_size_next, dp_size_next = mpu.get_op_tp_size(op_index), mpu.get_op_dp_size(op_index)
< 
<                                 for i in range(num_ranks_in_receive_stage):
<                                     if ranks_in_receive_stage[i] == rank:
<                                         dp_id = i // tp_size
<                                         tp_id = i % tp_size
< 
<                                 next_dp_id = [dp_id]
<                                 next_tp_id = [tp_id]
< 
<                                 if tp_size_next > tp_size:
<                                     ratio = tp_size_next // tp_size
<                                     next_tp_id = range(tp_id * ratio, (tp_id + 1)*ratio)                                    
<                                 if tp_size_next < tp_size:
<                                     ratio = tp_size // tp_size_next
<                                     next_tp_id = [tp_id // ratio]  
<                                 if dp_size_next > dp_size:
<                                     ratio = dp_size_next // dp_size
<                                     next_dp_id = [dp_id * ratio]                                 
<                                 if dp_size_next < dp_size:
<                                     ratio = dp_size // dp_size_next
<                                     next_dp_id = [dp_id // ratio]   
< 
<                                 op.shared_weights_info[key]["sharing_weights_with_ranks"][op_index] = []
< 
<                                 for _dp_id in next_dp_id:
<                                     tmp_list = []
<                                     for _tp_id in next_tp_id:
<                                         tmp_list.append(ranks_in_send_stage[_dp_id * tp_size_next + _tp_id])
<                                     op.shared_weights_info[key]["sharing_weights_with_ranks"][op_index].append(list(tmp_list))
< 
<         # send & receive tensors
<         for model in models:
<             model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module)) 
<             for op in model.language_model.ops:
<                 if len(op.shared_weights_info) > 0:
<                     is_root = False 
<                     for key in op.shared_weights_info:
<                         if op.shared_weights_info[key]["root"]:
<                             is_root = True
<                     if is_root:
<                         send_shared_tensors(op, models, grads=False)
<                     else:
<                         recv_tensor = recv_shared_tensors(op, models, grads=False)
<                         op.set_shared_tensor(recv_tensor, grads=False)
<         
< 
< def synchronize_shared_weights_grads(models):
<     if ENABLE_WEIGHT_SHARE:
<         for model in models:
<             model = unwrap_model(model, (torchDDP, LocalDDP, Float16Module)) 
<             # two-phase to avoid deadlock
<             # Phase 1: root: receive, sum up, send out
<             #          workers: send
<             for op in model.language_model.ops:
<                 if len(op.shared_weights_info) > 0:
<                     is_root = False
<                     for key in op.shared_weights_info:
<                         if op.shared_weights_info[key]["root"]:
<                             is_root = True                
<                     if is_root:
<                         grads_dict = {}
<                         recv_grads_dict = recv_shared_tensors(op, models, grads=True)
<                         current_grads_dict = op.get_shared_tensor(grads=True)
<                         for key in sorted(op.shared_weights_info):
<                             # receive grads from all sync-ops.
<                             recv_grads = recv_grads_dict[key]
<                             # sum up the grads from all sync-ops and this op.
<                             current_grads = current_grads_dict[key]
<                             recv_grads.append(current_grads)
<                             grads_dict[key] = [sum(recv_grads)]               
<                         op.set_shared_tensor(grads_dict, grads=True)                    
<                         # send sum of grads back to all the sync-ops.                  
<                         send_shared_tensors(op, models, grads=True)                   
<                     else:
<                         # send grads to root op. 
<                         send_shared_tensors(op, models, grads=True)
< 
<             # Phase 2: workers: receive
<             for op in model.language_model.ops:
<                 if len(op.shared_weights_info) > 0:
<                     is_root = False
<                     for key in op.shared_weights_info:
<                         if op.shared_weights_info[key]["root"]:
<                             is_root = True                  
<                     if not is_root:               
<                         # recv sum of grads.
<                         recv_grads = recv_shared_tensors(op, models, grads=True)
<                         # update grads.
<                         op.set_shared_tensor(recv_grads, grads=True)
504,505c200
<        args.virtual_pipeline_model_parallel_size is not None \
<            and args.virtual_pipeline_model_parallel_size > 1:
---
>        args.virtual_pipeline_model_parallel_size is not None:
536,538d230
<         if DEBUG_FIX_WEIGHT:
<             set_weight(model_module)            
< 
552,553d243
<     initialize_weights_sharing(model)
< 
621a312
> 
622a314
> 
627a320
> 
666,669c359,360
< 
<     ## New Megatron
<     if optimizer is not None:
<         optimizer.zero_grad()    
---
>     else:
>         optimizer.zero_grad()
673,679c364,367
<             if args.virtual_pipeline_model_parallel_size > 1:
<                 forward_backward_func = forward_backward_pipelining_with_interleaving
<                 assert get_num_microbatches() % args.pipeline_model_parallel_size == 0, \
<                     'number of microbatches is not divisible by pipeline-parallel ' \
<                     'size when using interleaved schedule'
<             else:
<                 forward_backward_func = forward_backward_pipelining_without_interleaving
---
>             forward_backward_func = forward_backward_pipelining_with_interleaving
>             assert get_num_microbatches() % args.pipeline_model_parallel_size == 0, \
>                 'number of microbatches is not divisible by pipeline-parallel ' \
>                 'size when using interleaved schedule'
684d371
< 
689,692d375
<     # Empty unused memory (From new Megatron)
<     if args.empty_unused_memory_level >= 1:
<         torch.cuda.empty_cache()
< 
700,711d382
<     if DEBUG_GRAD:
<         string = f"=================== grad info BEFORE sync [rank {torch.distributed.get_rank()}] ==================="
<         with open(f"{args.log_path}{args.log_name}_debug_grad_rank_{torch.distributed.get_rank()}.log", "a+") as f:
<             f.write(string+"\n")    
<         total_size = 0
<         for name, params in model[0].named_parameters():
<             param_size = list(params.data.size())
<             string = f"[DEBUG] param name {name}, grad_requires: {params.requires_grad},\n weight({params.data.dtype}): {params.data} \n grad_value ({params.main_grad.dtype}): {params.main_grad}"
<             with open(f"{args.log_path}{args.log_name}_debug_grad_rank_{torch.distributed.get_rank()}.log", "a+") as f:
<                 f.write(string+"\n")  
<         print(f"[TOTAL PARAMS SIZE] {total_size} MB")
< 
713a385,386
>     # This should only run for models that support pipelined model parallelism
>     # (BERT and GPT-2).
715c388,404
<     synchronize_shared_weights_grads(model)
---
>     if (mpu.is_pipeline_first_stage(ignore_virtual=True) or
>         mpu.is_pipeline_last_stage(ignore_virtual=True)) and \
>             mpu.get_pipeline_model_parallel_world_size() > 1:
>         if mpu.is_pipeline_first_stage(ignore_virtual=True):
>             unwrapped_model = model[0]
>         elif mpu.is_pipeline_last_stage(ignore_virtual=True):
>             unwrapped_model = model[-1]
>         unwrapped_model = unwrap_model(
>             unwrapped_model, (torchDDP, LocalDDP, Float16Module))
> 
>         if unwrapped_model.share_word_embeddings:
>             word_embeddings_weight = unwrapped_model.word_embeddings_weight()
>             if args.DDP_impl == 'local':
>                 grad = word_embeddings_weight.main_grad
>             else:
>                 grad = word_embeddings_weight.grad
>             torch.distributed.all_reduce(grad, group=mpu.get_embedding_group())
718,726d406
<     if DEBUG_GRAD:
<         string = f"=================== grad info AFTER sync [rank {torch.distributed.get_rank()}] ==================="
<         with open(f"{args.log_path}{args.log_name}_debug_grad_rank_{torch.distributed.get_rank()}.log", "a+") as f:
<             f.write(string+"\n")    
<         for name, params in model[0].named_parameters():
<             string = f"[DEBUG] param name {name}, grad_requires: {params.requires_grad},\n weight: {params.data} \n grad_value: {params.main_grad}"
<             with open(f"{args.log_path}{args.log_name}_debug_grad_rank_{torch.distributed.get_rank()}.log", "a+") as f:
<                 f.write(string+"\n")   
< 
729,734c409,410
<     update_successful, grad_norm, num_zeros_in_grad = optimizer.step()    
<     timers('optimizer').stop()        
< 
<     # Empty unused memory
<     if args.empty_unused_memory_level >= 2:
<         torch.cuda.empty_cache()
---
>     update_successful, grad_norm, num_zeros_in_grad = optimizer.step()
>     timers('optimizer').stop()
739c415,416
<                     args.micro_batch_size                  
---
>                     args.micro_batch_size * \
>                     args.data_parallel_size
757c434
<                  grad_norm, params_norm, num_zeros_in_grad, model=None):
---
>                  grad_norm, params_norm, num_zeros_in_grad):
798d474
< 
802,803d477
<     add_to_logging('forward-send-backward-recv')
<     add_to_logging('forward-send-forward-recv')
811a486,489
>     add_to_logging('optimizer-copy-to-main-grad')
>     add_to_logging('optimizer-unscale-and-check-inf')
>     add_to_logging('optimizer-clip-main-grad')
>     add_to_logging('optimizer-copy-main-to-model-params')
815,816d492
<     model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module)) 
< 
818c494,495
<     batch_size = args.micro_batch_size * get_num_microbatches()
---
>     batch_size = args.micro_batch_size * args.data_parallel_size * \
>         get_num_microbatches()
896d572
< 
901,908c577
<         _time_to_csv = timers.log(timers_to_log, normalizer=args.log_interval)
< 
<         if iteration == (args.train_iters - 1):
<             time_to_csv = [["global_batch_size", "time"] + _time_to_csv[0], [batch_size, f"{elapsed_time_per_iteration * 1000.0:.2f}"] + _time_to_csv[1]]
<             with open(f"{args.log_path}csv/{args.log_name}_stage{mpu.get_pipeline_model_parallel_rank()}_rank{torch.distributed.get_rank()}.csv", mode="w", newline="") as file:
<                 writer = csv.writer(file)
<                 for row in time_to_csv:
<                     writer.writerow(row)
---
>         timers.log(timers_to_log, normalizer=args.log_interval)
910c579
<     return report_memory_flag, elapsed_time_per_iteration * 1000.0
---
>     return report_memory_flag
929a599
> 
942c612
<     
---
> 
944c614
<     # print_datetime('before the start of training step')
---
>     print_datetime('before the start of training step')
946d615
< 
948d616
<         # print(f"iteration {iteration}")
957,959c625,627
< 
<         args.consumed_train_samples += args.micro_batch_size * \
<                                     get_num_microbatches()                                            
---
>         args.consumed_train_samples += mpu.get_data_parallel_world_size() * \
>                                        args.micro_batch_size * \
>                                        get_num_microbatches()
966c634
<         report_memory_flag, iteration_time = training_log(loss_dict, total_loss_dict,
---
>         report_memory_flag = training_log(loss_dict, total_loss_dict,
970c638
<                                           grad_norm, params_norm, num_zeros_in_grad, model)
---
>                                           grad_norm, params_norm, num_zeros_in_grad)
979,984c647,652
<         # if args.eval_interval and iteration % args.eval_interval == 0 and \
<         #    args.do_valid:
<         #     prefix = 'iteration {}'.format(iteration)
<         #     evaluate_and_print_results(prefix, forward_step_func,
<         #                                valid_data_iterator, model,
<         #                                iteration, False)
---
>         if args.eval_interval and iteration % args.eval_interval == 0 and \
>            args.do_valid:
>             prefix = 'iteration {}'.format(iteration)
>             evaluate_and_print_results(prefix, forward_step_func,
>                                        valid_data_iterator, model,
>                                        iteration, False)
1017a686
> 
1057,1059c726,728
<             args.consumed_valid_samples += args.micro_batch_size \
<                                         * get_num_microbatches()
< 
---
>             args.consumed_valid_samples += mpu.get_data_parallel_world_size() \
>                                            * args.micro_batch_size \
>                                            * get_num_microbatches()
1126c795
<     # Data loader only on rank 0 of each model parallel group. 
---
>     # Data loader only on rank 0 of each model parallel group.
1135c804
<                         args.eval_iters
---
>                      args.eval_iters
1138,1139c807,808
<                                         eval_iters * args.global_batch_size,
<                                         test_iters * args.global_batch_size]
---
>                                       eval_iters * args.global_batch_size,
>                                       test_iters * args.global_batch_size]
1157,1164c826,828
<         # do_train = train_dataloader is not None and args.train_iters > 0
<         # do_valid = valid_dataloader is not None and args.eval_iters > 0
<         # do_test = test_dataloader is not None and args.eval_iters > 0
< 
<         do_train = args.train_iters > 0
<         do_valid = args.eval_iters > 0
<         do_test = args.eval_iters > 0
< 
---
>         do_train = train_dataloader is not None and args.train_iters > 0
>         do_valid = valid_dataloader is not None and args.eval_iters > 0
>         do_test = test_dataloader is not None and args.eval_iters > 0
diff --color -r runtime/megatron/utils.py ../Megatron-LM-base/megatron/utils.py
2,9d1
< # Copyright (c) Microsoft Corporation.
< # Licensed under the MIT License.
< 
< # The file has been adapted from the following Megatron-LM file:
< # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/utils.py
< # Git commit hash: 42c1cf4279acea5a554500dcb552211f44cbec45
< # We retain the following copyright from the original files:
< 
40,127c32
< from dataclasses import dataclass, field
< import gc
< import os
< DEBUG_MEM = os.environ.get("DEBUG_MEM", '0') == '1'
< 
< @dataclass
< class OpConfig:
<     name: str
<     prev_name: str
<     input_tensors_info: dict = field(default_factory=dict)
<     output_tensors_info: dict = field(default_factory=dict)
<     input_extra_tensors_info: dict = field(default_factory=dict)
<     output_extra_tensors_info: dict = field(default_factory=dict)    
<     shared_weights_info: dict = field(default_factory=dict)
< 
< snapmap = dict()
< 
< def debug_mem_report(log_name, path=None, return_string=False):
<     '''Report the memory usage of the tensor.storage in pytorch
<     Both on CPUs and GPUs are reported'''
<     
<     def _mem_report(tensors, mem_type):
<         '''Print the selected tensors of type
<         There are two major storage types in our major concern:
<             - GPU: tensors transferred to CUDA devices
<             - CPU: tensors remaining on the system memory (usually unimportant)
<         Args:
<             - tensors: the tensors of specified type
<             - mem_type: 'CPU' or 'GPU' in current implementation '''
<         string = ""
<         # print('Storage on %s' %(mem_type))
<         string += 'Storage on %s\n' %(mem_type)
<         # print('-'*LEN)
<         string += '-'*LEN + "\n"
<         total_numel = 0
<         total_mem = 0
<         visited_data = []
<         string_large = ""
<         string_small = ""
<         for tensor in tensors:
<             if tensor.is_sparse:
<                 continue
<             # a data_ptr indicates a memory block allocated
<             data_ptr = tensor.storage().data_ptr()
<             if data_ptr in visited_data:
<                 continue
<             visited_data.append(data_ptr)
< 
<             numel = tensor.storage().size()
<             total_numel += numel
<             element_size = tensor.storage().element_size()
<             mem = numel*element_size /1024/1024 # 32bit=4Byte, MByte
<             total_mem += mem
<             element_type = type(tensor).__name__
<             size = tuple(tensor.size())
< 
<             # print('%s\t\t%s\t\t%.2f' % (element_type, size, mem) )
<             if mem > 1:
<                 string_large += '%s\t\t%s\t\t%.2f\t\t%d\n' % (element_type, size, mem, data_ptr)
<             else:
<                 string_small += '%s\t\t%s\t\t%.2f\t\t%d\n' % (element_type, size, mem, data_ptr)
<         # print('-'*LEN)
<         string += string_large
<         # string += "\n" + string_small
<         string += '-'*LEN + "\n"
<         # print('Total Tensors: %d \tUsed Memory Space: %.2f MBytes' % (total_numel, total_mem) )
<         string += 'Total Tensors: %d \tUsed Memory Space: %.2f MBytes\n' % (total_numel, total_mem)
<         # print('-'*LEN)
<         string += '-'*LEN + "\n"
<         return string
< 
<     string = ""
<     LEN = 65
<     string += f"================================== rank {torch.distributed.get_rank()} {log_name} ==================================\n"
<     objects = gc.get_objects()
<     string += '%s\t%s\t\t\t%s\n' %('Element type', 'Size', 'Used MEM(MBytes)')
<     tensors = [obj for obj in objects if torch.is_tensor(obj)]
<     cuda_tensors = [t for t in tensors if t.is_cuda]
<     string += _mem_report(cuda_tensors, 'GPU')
<     string += '='*LEN + "\n"
<     if path:
<         with open(path, "a+") as f:
<             f.write(string+"\n")   
<     else:
<         if return_string:
<             return string
<         else:
<             print(string + "\n")        
---
> 
188c93
< def report_memory(name, get_list=False):
---
> def report_memory(name):
190d94
< 
192,196d95
<     allocated = torch.cuda.memory_allocated() / mega_bytes
<     max_allocated = torch.cuda.max_memory_allocated() / mega_bytes
<     reserved = torch.cuda.memory_reserved() / mega_bytes
<     max_reserved = torch.cuda.max_memory_reserved() / mega_bytes
< 
198,208c97,107
<     string += ' | allocated: {}'.format(allocated)
<     string += ' | max allocated: {}'.format(max_allocated)
<     string += ' | reserved: {}'.format(reserved)
<     string += ' | max reserved: {}'.format(max_reserved)
< 
<     if get_list:
<         mem_to_csv = [["allocated", "max_allocated", "reserved", "max_reserved"], 
<             [f"{allocated:.2f}", f"{max_allocated:.2f}", f"{reserved:.2f}", f"{max_reserved:.2f}"]]
<         return string, mem_to_csv
< 
<     return string
---
>     string += ' | allocated: {}'.format(
>         torch.cuda.memory_allocated() / mega_bytes)
>     string += ' | max allocated: {}'.format(
>         torch.cuda.max_memory_allocated() / mega_bytes)
>     string += ' | reserved: {}'.format(
>         torch.cuda.memory_reserved() / mega_bytes)
>     string += ' | max reserved: {}'.format(
>         torch.cuda.max_memory_reserved() / mega_bytes)
>     if mpu.get_data_parallel_rank() == 0:
>         print("[Rank {}] {}".format(torch.distributed.get_rank(), string),
>               flush=True)