Merge branch 'master' into fix_bwd_memcpy_4_infinity

microsoft · Oct 28, 2024 · 926451b · 926451b
2 parents 45f03f6 + 229960a
commit 926451b
Show file tree

Hide file tree

Showing 15 changed files with 367 additions and 24 deletions.
diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py
@@ -390,9 +390,14 @@ def init_device_mesh(self, mesh_shape, mesh_dim_names):
         if not required_torch_version(min_version=2.2):
             raise RuntimeError(f"Current torch version does not have device mesh"
                                f"api (torch.__version__: {torch.__version__})")
-        return torch.distributed.device_mesh.init_device_mesh(get_accelerator().current_device_name(),
-                                                              mesh_shape,
-                                                              mesh_dim_names=mesh_dim_names)
+        if not required_torch_version(max_version=2.4):
+            return torch.distributed.device_mesh.init_device_mesh(get_accelerator().device_name(),
+                                                                  mesh_shape,
+                                                                  mesh_dim_names=mesh_dim_names)
+        else:
+            return torch.distributed.device_mesh.init_device_mesh(get_accelerator().current_device_name(),
+                                                                  mesh_shape,
+                                                                  mesh_dim_names=mesh_dim_names)
 
 
 # This will become a light-weight wrapper around torch.distributed functions

diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
@@ -345,7 +345,7 @@ def set_lm_head(module):
                                                       "weight") and not module.embed_out.weight.is_meta and isinstance(
                                                           module.embed_out, torch.nn.Linear):
             module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out")
-        elif hasattr(module.language_model, "lm_head"):
+        elif hasattr(module, "language_model") and hasattr(module.language_model, "lm_head"):
             module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head")
         return module
 

diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py
@@ -24,7 +24,9 @@ def set_n_embd(num):
 
 def get_num_kv_heads():
     global num_kv_heads
-    return num_kv_heads
+    if 'num_kv_heads' in globals():
+        return num_kv_heads
+    return None
 
 
 def get_num_attention_heads():

diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py
@@ -115,7 +115,7 @@ def start_time_hook(module, input):
                 get_accelerator().synchronize()
                 module.__start_time__ = time.time()
 
-            if not hasattr(module, "__start_time_hook_handle"):
+            if not hasattr(module, "__start_time_hook_handle__"):
                 module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook)
 
             def end_time_hook(module, input, output):

diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py
@@ -5,6 +5,15 @@
 
 import torch
 
+try:
+    from torch.compiler import is_compiling as torch_is_compiling
+except ImportError:
+    try:
+        from torch._dynamo.external_utils import is_compiling as torch_is_compiling
+    except ImportError:
+        # Torch does not have compiler support
+        torch_is_compiling = lambda: False
+
 
 def is_compile_supported():
     return hasattr(torch, "compiler") and hasattr(torch.nn.Module, "compile")
@@ -14,3 +23,7 @@ def disable(func):
     if is_compile_supported():
         return torch.compiler.disable(func)
     return func
+
+
+def is_compiling():
+    return torch_is_compiling()
diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py
@@ -38,7 +38,7 @@ def _apply_forward_and_backward_to_tensors_only(module, forward_function, backwa
 
 class ZeROOrderedDict(OrderedDict):
 
-    def __init__(self, parent_module=None, *args, **kwargs):
+    def __init__(self, parent_module, *args, **kwargs):
         """A replacement for ``collections.OrderedDict`` to detect external ZeRO params.
 
         Args:
@@ -49,13 +49,18 @@ def __init__(self, parent_module=None, *args, **kwargs):
         self._parent_module = parent_module
         self._in_forward = False
 
+    def __reduce__(self):
+        r0, _, *r2 = super().__reduce__()
+        return (r0, (self._parent_module, )) + r2
+
     def __getitem__(self, key):
         param = super().__getitem__(key)
 
         # Params can be registered as None (e.g., bias)
         if param is None:
             return param
 
+        # TODO: only weaken this check during compilation
         if hasattr(param, "ds_status") and param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
             if self._parent_module._parameters._in_forward:
                 register_external_parameter(FWD_MODULE_STACK[-1], param)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1070,14 +1070,10 @@ def average_tensor(self, tensor):
             for i, param, param_id in self.params_in_ipg_bucket:
 
                 process_group = self.dp_process_group
-                grad_reduc = self.get_gradient_for_reduction(param)
-                #Averages gradients at parameter level if ipg has a moe param
-                #Otherwise averaging is done at the entire buffer level at the end of the loop
-                # MoE param have different groups
+
                 if self.ipg_bucket_has_moe_params:
                     process_group = self.expert_dp_process_group[param.group_name] if is_moe_param(
                         param) else self.dp_process_group
-                    grad_reduc.data.div_(dist.get_world_size(group=process_group) / float(self.sequence_parallel_size))
 
                 partition_ids = self.param_to_partition_ids[i][param_id]
                 assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids
@@ -1116,8 +1112,7 @@ def average_tensor(self, tensor):
                     curr_size += numel
                     prev_id, prev_process_group = partition_id, process_group
 
-            if not self.ipg_bucket_has_moe_params:
-                tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size))
+            tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size))
 
             buckets = {}
             for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets):

diff --git a/deepspeed/sequence/layer.py b/deepspeed/sequence/layer.py
@@ -10,6 +10,8 @@
 
 import deepspeed.comm as dist
 from deepspeed.accelerator import get_accelerator
+from deepspeed.module_inject.tp_shard import get_shard_size_list, set_num_kv_heads, get_num_kv_heads
+from deepspeed.utils import groups
 
 
 def post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, seq_len, num_head, head_dim):
@@ -38,8 +40,132 @@ def post_func(input):
     return post_func
 
 
+def uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group):
+    seq_world_size = dist.get_world_size(group)
+    inp_shape = list(input.shape)
+    assert batch_dim_idx in [0, 1], "batch_dim_idx must be either 0 or 1"
+
+    if not (scatter_idx < 2):
+        input_splits = get_shard_size_list(inp_shape[scatter_idx], seq_world_size)
+        input = input.transpose(0, scatter_idx).contiguous()
+        local_heads = input_splits[groups._get_sequence_parallel_rank()]
+        output_splits = [local_heads] * seq_world_size
+
+        output_buffer_shape = [seq_world_size * local_heads] + list(input.shape[1:])
+        output = torch.empty(output_buffer_shape, device=input.device, dtype=input.dtype)
+        dist.all_to_all_single(output,input,output_split_sizes=output_splits,\
+            input_split_sizes=input_splits,group=group)
+        ###[seq_ws*local_heads, ...] to [seq_ws, local_heads, ...]
+        output = output.view(seq_world_size, local_heads, *output.shape[1:])
+        ###move local_heads behind the sequence/batch dims; the permutation depends on batch_dim_idx:
+
+        ### batch_dim_idx=0 [seq_ws,local_heads,seq_len,b,...] to [b, seq_ws, seq_len, local_heads ...]
+        ### batch_dim_idx=1 [seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...]
+        if batch_dim_idx == 0:
+            order = [3, 0, 2, 1] + list(range(4, len(output.shape)))
+            output = output.permute(order).contiguous()
+            ###[b, seq_ws*local_seq_len, local_heads,...]
+            output = output.view(output.shape[0], inp_shape[gather_idx] * seq_world_size,
+                                 *output.shape[3:]).contiguous()
+        elif batch_dim_idx == 1:
+            output = output.transpose(1, 3).contiguous()
+            ###[seq_ws*local_seq_len, b, local_heads,...]
+            output = output.view(inp_shape[gather_idx] * seq_world_size, *output.shape[2:]).contiguous()
+    else:
+        # Accept both 4D and 3D inputs by flattening everything after the first two dims into one (3D).
+        input = input.reshape(input.shape[0], input.shape[1], -1)
+
+        if batch_dim_idx == 0:  #b,s,h
+            input = input.permute(1, 2, 0).contiguous()  #s,h,b
+        elif batch_dim_idx == 1:  #s,b,h
+            input = input.transpose(1, 2).contiguous()  #s,h,b
+        seq_len, h, batch_size = input.shape
+        num_local_heads_list = get_shard_size_list(get_num_kv_heads(), seq_world_size)
+        local_heads = num_local_heads_list[groups._get_sequence_parallel_rank()]
+        h_dim = h // local_heads
+        local_seq_len = seq_len // seq_world_size
+
+        input = input.view(seq_len * h, batch_size)
+        local_seq_len_with_heads = int(input.shape[0] / seq_world_size)  # dim size of local_seq_len*local_heads*hdim
+        input_splits = [local_seq_len_with_heads] * seq_world_size
+        coeff = local_seq_len_with_heads // local_heads  #per head: dim size of local_seq_len*hdim
+
+        # scale factor for the uneven split: total_heads / local_heads (this rank's head count).
+        heads_scale_coeff = get_num_kv_heads() / local_heads
+
+        output_splits = [num_local_heads * coeff for num_local_heads in num_local_heads_list]
+        output_buff_d1_size = int(heads_scale_coeff * local_seq_len_with_heads)
+        total_h = int(inp_shape[gather_idx] * heads_scale_coeff)
+        output = torch.empty(output_buff_d1_size, input.shape[1], device=input.device, dtype=input.dtype)
+        dist.all_to_all_single(output,input,output_split_sizes=output_splits, \
+            input_split_sizes=input_splits,group=group)
+        ##################
+        #suppose 7 heads are divided across 4 ranks as [2,2,2,1]
+        #chunk_num_heads_small=floor(7/4)=1
+        #chunk_num_heads_large=ceil(7/4)=2
+        #num_chunk_heads_large=len([2,2,2])=3, all2all_buffer_counts
+        #num_chunk_heads_small=len([1])=1, all2all_buffer_counts
+        #total_num_large_heads=sum([2,2,2])=6
+        #total_num_small_heads=sum([1])=1
+
+        chunk_num_heads_small = get_num_kv_heads() // seq_world_size  # even heads compatible
+        chunk_num_heads_large = chunk_num_heads_small + 1
+        num_chunk_heads_large = get_num_kv_heads() % seq_world_size
+        num_chunk_heads_small = seq_world_size - num_chunk_heads_large
+        total_num_large_heads = num_chunk_heads_large * chunk_num_heads_large
+        total_num_small_heads = num_chunk_heads_small * chunk_num_heads_small
+
+        heads_large_combine_size = coeff * total_num_large_heads
+        heads_small_combine_size = coeff * total_num_small_heads
+        heads_large_chunk, heads_small_chunk = output.split([heads_large_combine_size, heads_small_combine_size],
+                                                            dim=0)
+        heads_large_chunk = heads_large_chunk.view(num_chunk_heads_large, local_seq_len, chunk_num_heads_large, h_dim,
+                                                   batch_size)
+        heads_small_chunk = heads_small_chunk.view(num_chunk_heads_small, local_seq_len, chunk_num_heads_small, h_dim,
+                                                   batch_size)
+        if batch_dim_idx == 0:
+            #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[batch,local_seq_len,all2all_buffer_counts*n_heads,dim]
+            order = [4, 1, 0, 2, 3]
+            heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(batch_size, local_seq_len,
+                                                                                   total_num_large_heads, h_dim)
+            heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(batch_size, local_seq_len,
+                                                                                   total_num_small_heads, h_dim)
+        elif batch_dim_idx == 1:
+            #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[local_seq_len,batch,all2all_buffer_counts*n_heads,dim]
+            order = [1, 4, 0, 2, 3]
+            heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(local_seq_len, batch_size,
+                                                                                   total_num_large_heads, h_dim)
+            heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(local_seq_len, batch_size,
+                                                                                   total_num_small_heads, h_dim)
+
+        output = torch.cat([heads_large_chunk, heads_small_chunk], dim=2).contiguous()
+
+        inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size
+        output_shape=  inp_shape[: gather_idx] + \
+            [total_h,] + \
+            inp_shape[gather_idx + 1:]
+
+        output = output.view(output_shape)
+
+    return output
+
+
 def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, async_op=False, handle=None, type=None):
     seq_world_size = dist.get_world_size(group)
+    # we only need num_heads once
+    num_heads = input.shape[2]
+
+    if get_num_kv_heads() is not None or num_heads % seq_world_size != 0:
+        # Assuming here that the number of heads for q is consistent with kv
+        # If not, additional logic is required for cases like GQA
+        if get_num_kv_heads() is None:
+            assert num_heads > seq_world_size, f"Number of heads ({num_heads}) must be larger than sequence parallel size ({seq_world_size})"
+            # set heads at first call by num_total_heads.
+            # then use ``get_num_kv_heads() is not None`` to re-enter the uneven path on later calls.
+            set_num_kv_heads(num_heads)
+        assert async_op == False, "uneven head sp does not support async op"
+        return uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group)
+
     if batch_dim_idx == 0:
         # b, s, n, h
         if scatter_idx < 2:

diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py
@@ -484,6 +484,8 @@ def _get_sequence_parallel_rank():
     global mpu
     if mpu is not None and hasattr(mpu, 'get_sequence_parallel_rank'):
         return mpu.get_sequence_parallel_rank()
+    if mesh_device is not None:
+        return dist.get_rank(mesh_device.get_group(mesh_dim="sequence_parallel"))
     return 0
 
 

diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py
@@ -7,8 +7,7 @@
 import logging
 import sys
 import os
-import torch
-from deepspeed.runtime.compiler import is_compile_supported
+from deepspeed.runtime.compiler import is_compile_supported, is_compiling
 
 log_levels = {
     "debug": logging.DEBUG,
@@ -26,7 +25,7 @@ def create_warning_filter(logger):
 
         def warn_once(record):
             nonlocal warn
-            if is_compile_supported() and torch.compiler.is_compiling() and not warn:
+            if is_compile_supported() and is_compiling() and not warn:
                 warn = True
                 logger.warning("To avoid graph breaks caused by logger in compile-mode, it is recommended to"
                                " disable logging by setting env var DISABLE_LOGS_WHILE_COMPILING=1")
@@ -39,7 +38,7 @@ def logging_decorator(func):
 
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
-            if torch.compiler.is_compiling():
+            if is_compiling():
                 return
             else:
                 return func(*args, **kwargs)

diff --git a/op_builder/builder.py b/op_builder/builder.py
@@ -67,7 +67,7 @@ def get_default_compute_capabilities():
             # Special treatment of CUDA 11.0 because compute_86 is not supported.
             compute_caps += ";8.0"
         else:
-            compute_caps += ";8.0;8.6"
+            compute_caps += ";8.0;8.6;9.0"
     return compute_caps