
[AutoTP] Make AutoTP work when num_heads not divisible by number of workers #4011

Merged
merged 55 commits on Oct 25, 2023
Changes from 30 commits
Commits (55)
0706acd
allow number of heads not divisible by number of ranks
delock Jul 20, 2023
0bf785f
get num_heads from model config, more robust
delock Jul 21, 2023
72b9e1a
simplify logic where num_head itself is sharded
delock Jul 21, 2023
5ed9a56
name tweaks
delock Jul 21, 2023
73f499d
make code more robust where num_attention_heads may not be defined in…
delock Jul 21, 2023
48322c7
Merge branch 'master' into gma/uneven_heads
delock Jul 21, 2023
f14e290
Merge branch 'master' into gma/uneven_heads
delock Jul 24, 2023
b62317c
Merge branch 'master' into gma/uneven_heads
loadams Jul 24, 2023
12c0628
support num_key_value_heads < num_attention_heads which is used by ll…
delock Jul 25, 2023
8f23d9b
add test for 5 ranks
delock Jul 25, 2023
9c53bd7
change odd rank # to 3 to avoid test skip
delock Jul 25, 2023
413224b
Merge branch 'master' into gma/uneven_heads
tjruwase Jul 25, 2023
78d6667
Merge branch 'master' into gma/uneven_heads
delock Aug 9, 2023
27fde30
add get_shard_size function
delock Aug 9, 2023
8e1fd27
modify sharding mechanism according to latest auto TP
delock Aug 10, 2023
9a6bc12
Merge branch 'master' into gma/uneven_heads
delock Aug 16, 2023
2dac94f
fix accuracy issue
delock Aug 17, 2023
885f6a3
Merge branch 'master' into gma/uneven_heads
delock Aug 17, 2023
7ffd811
Merge branch 'master' into gma/uneven_heads
molly-smith Aug 18, 2023
40659ba
Merge branch 'master' into gma/uneven_heads
tjruwase Aug 22, 2023
71f9f40
fix format
delock Aug 21, 2023
db9db6b
skip tests with fusedqkv
delock Aug 23, 2023
72531c0
Merge branch 'master' into gma/uneven_heads
delock Aug 23, 2023
9d5eae3
remove skip of fusedqkv tests
delock Aug 23, 2023
25e656d
skip test fusedqkv with odd number of ranks
delock Aug 23, 2023
7f6d7f6
support model with n_heads in model_config
delock Aug 24, 2023
e3a5b77
Merge branch 'master' into gma/uneven_heads
molly-smith Aug 24, 2023
c9ec881
Merge branch 'master' into gma/uneven_heads
delock Aug 26, 2023
f5be257
fix TestInjectionPolicy::test[fp32-t5]
delock Aug 27, 2023
b671040
fix uneven_heads on some fusedqkv types (#12)
inkcherry Aug 28, 2023
d59ff22
better fix when activation size cannot be divided by number of heads
delock Aug 30, 2023
6c3c841
Merge branch 'master' into gma/uneven_heads_rebase
delock Aug 30, 2023
58e8b24
Merge branch 'master' into gma/uneven_heads
molly-smith Sep 1, 2023
4c6b7fa
move tp_shard.py under module_inject
delock Sep 6, 2023
18e1c5d
Merge branch 'master' into gma/uneven_heads
delock Sep 6, 2023
8ef01e2
Add get_num_kv_heads in tp_shard.py
delock Sep 7, 2023
9a61fc2
Merge branch 'master' into gma/uneven_heads
delock Sep 11, 2023
74870db
Merge branch 'master' into gma/uneven_heads
delock Sep 13, 2023
115cc20
Merge branch 'master' into gma/uneven_heads
molly-smith Sep 13, 2023
0781c41
Refine according to comments
delock Sep 14, 2023
194337f
remove old comment
mrwyattii Sep 14, 2023
47d84ca
Merge branch 'master' into gma/uneven_heads
delock Sep 18, 2023
369eb3e
Merge branch 'master' into gma/uneven_heads
mrwyattii Sep 19, 2023
567fb9a
fix bug in getting num_kv_heads
delock Sep 20, 2023
47c83ca
Merge branch 'master' into gma/uneven_heads
molly-smith Sep 20, 2023
d194ab0
Merge branch 'master' into gma/uneven_heads
tjruwase Sep 27, 2023
6db5ddd
Merge branch 'master' into gma/uneven_heads
delock Oct 7, 2023
698b62a
Merge branch 'up-master' into gma/uneven_heads
delock Oct 10, 2023
d75149f
support uneven sharding of lm_head tensor parallel
delock Oct 10, 2023
248532d
Merge branch 'master' into gma/uneven_heads
delock Oct 11, 2023
a9056fd
Merge branch 'master' into gma/uneven_heads
delock Oct 11, 2023
81bd29f
Merge branch 'master' into gma/uneven_heads
delock Oct 12, 2023
693a9fe
Merge branch 'master' into gma/uneven_heads
delock Oct 18, 2023
4c45a5b
Merge branch 'master' into gma/uneven_heads
delock Oct 19, 2023
a7513e1
Merge branch 'master' into gma/uneven_heads
delock Oct 24, 2023
23 changes: 13 additions & 10 deletions deepspeed/module_inject/auto_tp.py
@@ -14,6 +14,7 @@
from .layers import LinearAllreduce, LinearLayer
from deepspeed.accelerator import get_accelerator
from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw
from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list


class ReplaceWithTensorSlicing:
@@ -308,8 +309,9 @@ def _replace(self, child, name, conv_linear_layer):

if self.conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
data = child.weight.data.split(
(weight_shape[0] if self.conv_linear_layer else weight_shape[1]) // self.mp_size, dim=1)
data = child.weight.data.split(get_shard_size_list(
weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size),
dim=1)
data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name())

setattr(child, "replaced", True)
@@ -332,13 +334,14 @@ def _replace(self, child, name, conv_linear_layer):
module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to(
get_accelerator().current_device_name())
else:
data = child.weight.data.split((weight_shape[0]) // self.mp_size,
data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size),
dim=1 if self.conv_linear_layer else 0)
data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name())

if child.bias is not None:
bias_data = child.bias.data.split(
(weight_shape[1] if self.conv_linear_layer else weight_shape[0]) // self.mp_size, dim=0)
bias_data = child.bias.data.split(get_shard_size_list(
weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size),
dim=0)
bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name())
bias_data = torch.nn.parameter.Parameter(bias_data, requires_grad=False)
else:
@@ -354,13 +357,13 @@ def _slice_embedding(self, child, name, conv_linear_layer):
mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group)

if hasattr(child.weight, 'ds_tensor'):
data = child.weight.ds_tensor.data.split(child.weight.shape[1] // self.mp_size, dim=1)
data = child.weight.ds_tensor.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1)
else:
data = child.weight.data.split(child.weight.shape[1] // self.mp_size, dim=1)
data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1)
data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name())
data = torch.nn.parameter.Parameter(data, requires_grad=False)

new_embedding = nn.Embedding(child.weight.shape[0], child.weight.shape[1] // self.mp_size)
new_embedding = nn.Embedding(child.weight.shape[0], get_shard_size(child.weight.shape[1], self.mp_size))
new_embedding.weight.data.copy_(data)
setattr(child, "replaced", True)
return new_embedding
@@ -374,8 +377,8 @@ def update_mp_params(self, child):
]:
if hasattr(child, param):
param_val = getattr(child, param)
assert param_val % self.mp_size == 0, f"{param} ({param_val}) must be divisible by mp_size ({self.mp_size})"
setattr(child, param, param_val // self.mp_size)
#assert param_val % self.mp_size == 0, f"{param} ({param_val}) must be divisible by mp_size ({self.mp_size})"
setattr(child, param, get_shard_size(param_val, self.mp_size))
setattr(child, "replaced", True)

def update_linear_policies(self):
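The split changes above in auto_tp.py replace an integer chunk size with an explicit list of shard sizes. The difference matters when the dimension is not divisible by the world size: an integer split silently yields an extra remainder chunk. A minimal sketch with made-up shapes and shard sizes (assuming 5 KV heads of width 64 on 3 ranks):

import torch

w = torch.randn(320, 320)                # hypothetical [out_features, in_features] weight
even = w.split(320 // 3, dim=0)          # old path: chunks of 106, 106, 106, 2 -> 4 shards for 3 ranks
uneven = w.split([128, 128, 64], dim=0)  # new path: sizes from get_shard_size_list(320, 3)
print([t.shape[0] for t in uneven])      # [128, 128, 64]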
9 changes: 5 additions & 4 deletions deepspeed/module_inject/auto_tp_model_utils.py
@@ -6,6 +6,7 @@
from deepspeed import comm as dist
import torch
from typing import Optional
from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list


def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor:
@@ -51,8 +52,8 @@ def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype
arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
alibi = slopes[..., None] * arange_tensor
if dist.is_initialized():
num_heads_per_rank = int(num_heads / dist.get_world_size())
offset = dist.get_rank() * num_heads_per_rank
num_heads_per_rank = get_shard_size(num_heads, dist.get_world_size())
offset = sum(get_shard_size_list(num_heads, dist.get_world_size())[0:dist.get_rank()])
alibi = alibi.view(batch_size, num_heads, 1, seq_length)
alibi = alibi[:, offset:num_heads_per_rank + offset, :, :]
return alibi.reshape(batch_size * num_heads_per_rank, 1, seq_length).to(dtype)
@@ -72,8 +73,8 @@ def build_mpt_atten_bias_tensor(self,
prefix_mask=prefix_mask,
sequence_id=sequence_id)
if dist.is_initialized():
num_heads_per_rank = int(self.config.n_heads / dist.get_world_size())
offset = dist.get_rank() * num_heads_per_rank
num_heads_per_rank = get_shard_size(self.config.n_heads, dist.get_world_size())
offset = sum(get_shard_size_list(self.config.n_heads, dist.get_world_size())[0:dist.get_rank()])
attn_bias = attn_bias[:, offset:num_heads_per_rank + offset, :, :]
return attn_bias, attention_mask

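Because the per-rank head counts are no longer equal, the rank offset above is computed as a prefix sum of the shard-size list rather than rank times a fixed heads-per-rank. A small sketch with illustrative numbers (5 heads on 3 ranks, shard sizes [2, 2, 1] in head units):

shard_sizes = [2, 2, 1]                 # e.g. get_shard_size_list(5, 3)
rank = 2
num_heads_per_rank = shard_sizes[rank]  # 1
offset = sum(shard_sizes[:rank])        # 4; the old int(5 / 3) * rank would give 2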
23 changes: 13 additions & 10 deletions deepspeed/module_inject/fusedqkv_utils.py
@@ -4,6 +4,8 @@
# DeepSpeed Team
import torch
from deepspeed.utils.logging import warning_once
from deepspeed.utils.tp_shard import get_shard_size, get_shard_size_list
import deepspeed.utils.tp_shard as tp_shard
import re


@@ -39,18 +41,19 @@ def prepare_tp_fused_qkvw(module_str, src, mp_size, gpu_index):

def _codegen_type_transpose(input, mp_size, codegen_mp_num=4):
# codegen_mp_num defined in https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py
#TODO: assert num_heads % (mp_size*codegen_mp_num) == 0
assert tp_shard.num_kv_heads % (
mp_size * codegen_mp_num) == 0, "codgen autoTP requires num_kv_heads % (mp_size*codegen_mp_num) == 0"
#input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias)

shape = input.shape
dst_shape = shape[0] // mp_size
dst_shape = get_shard_size(shape[0], mp_size)
num_mp_blocks = input.reshape(codegen_mp_num, shape[0] // codegen_mp_num, shape[1])

#num_mp_blocks : [codegen_mp_num, 3*hidden_dim/codegen_mp_num, :]
src_split = list(torch.split(num_mp_blocks, num_mp_blocks.shape[1] // 3, dim=1))
src_split = [x.reshape(codegen_mp_num * mp_size, -1, shape[1]) for x in src_split]

split_fusedqkv = split_by_qkvlist_and_refuse(src_split, shape[0] // 3 // mp_size, 0, 1)
split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size(shape[0] // 3, mp_size), 0, 1)
tp_fuseqkv_weight = torch.cat(split_fusedqkv, dim=0).reshape(shape[0], -1)

return tp_fuseqkv_weight[gpu_index * dst_shape:(gpu_index + 1) * dst_shape]
@@ -59,16 +62,16 @@ def _glm_type_transpose(input, mp_size):
#input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias)

shape = input.shape
dst_shape = shape[0] // mp_size
src_split = torch.split(input, shape[0] // 3, dim=0)

split_fusedqkv = split_by_qkvlist_and_refuse(src_split, shape[0] // 3 // mp_size)
tp_fuseqkv_weight = torch.cat(split_fusedqkv, dim=0)

return tp_fuseqkv_weight[gpu_index * dst_shape:(gpu_index + 1) * dst_shape]
split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size_list(shape[0] // 3, mp_size))
return split_fusedqkv[gpu_index]

def _bloom_type_transpose(input, mp_size):
return input
shape = input.shape

split_fusedqkv = input.split(get_shard_size_list(shape[0], mp_size), dim=0)
return split_fusedqkv[gpu_index]

def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None):

@@ -91,4 +94,4 @@ def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None):
return _transpose_fused_qkvw(src, mp_size, fused_type)
warning_once(f"Unrecognized fusedkqv weight type, default to using bloom type,"
f"please check in prepare_tp_fused_qkvw() to avoid potential calculation errors")
return src
return _bloom_type_transpose(src, mp_size)
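A rough sketch of the bloom-type path above (hidden size and head count are hypothetical; BLOOM keeps Q, K and V interleaved per head, so a single row-wise split per rank suffices):

import torch

hidden = 320                                 # pretend hidden size: 5 heads of width 64
fused_qkv = torch.randn(3 * hidden, hidden)  # fused [3*hidden, hidden] weight
shard_sizes = [384, 384, 192]                # e.g. get_shard_size_list(3 * hidden, 3) with 5 KV heads
shards = fused_qkv.split(shard_sizes, dim=0)
print([tuple(s.shape) for s in shards])      # [(384, 320), (384, 320), (192, 320)]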
17 changes: 15 additions & 2 deletions deepspeed/module_inject/replace_module.py
@@ -16,6 +16,7 @@
from .auto_tp import AutoTP, ReplaceWithTensorSlicing, Loading

from deepspeed import comm as dist
from deepspeed.utils.tp_shard import set_num_kv_heads

from .load_checkpoint import load_model_with_checkpoint
import time
@@ -271,10 +272,22 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
# 2. Set the tensor parallelism config
_autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group)

# 3. Set linear policies
# 3. Try to get num_key_heads from model_config.num_key_value_heads
num_kv_heads = None
kv_head_names = ['num_key_value_heads', 'num_attention_heads', 'n_heads']
for name in kv_head_names:
if hasattr(model_config, name):
num_kv_heads = getattr(model_config, name)
if num_kv_heads != None:
break

# 5. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division
set_num_kv_heads(num_kv_heads)

# 6. Set linear policies
_autotp.update_linear_policies()

# 4. Replace modules
# 7. Replace modules
return _autotp._replace_module(module)

def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None):
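The attribute lookup added above can be exercised with a hypothetical GQA-style config (FakeConfig is a stand-in, not a real transformers class); for grouped-query models the KV head count is found first, so sharding follows KV heads:

class FakeConfig:
    num_key_value_heads = 8
    num_attention_heads = 64

num_kv_heads = None
for name in ['num_key_value_heads', 'num_attention_heads', 'n_heads']:
    if hasattr(FakeConfig, name):
        num_kv_heads = getattr(FakeConfig, name)
    if num_kv_heads is not None:
        break
print(num_kv_heads)  # 8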
35 changes: 35 additions & 0 deletions deepspeed/utils/tp_shard.py
@@ -0,0 +1,35 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from deepspeed import comm as dist
global num_kv_heads
Review thread on this line:

dc3671 (Contributor):
Would adding a global variable here be a good practice? Maybe you could make it a class attribute by adding a util class like Loading, or combine these functions with AutoTP or ReplaceWithTensorSlicing?

delock (Collaborator, Author):
get_shard_size and get_shard_size_list are called from three different files. Wrapping num_kv_heads in a class attribute would need a place to hold the instance. Any suggestion on how to reference that class instance?

molly-smith (Contributor):
I agree with @dc3671 that it would be nice to have these functions in AutoTP, but that could create a circular dependency with fusedqkv_utils.py. I think the module_inject folder is more appropriate than the utils folder, though. Adding these functions to auto_tp_model_utils.py would also be okay.

delock (Collaborator, Author):
Thanks @molly-smith, I'll try to move this file into the module_inject folder.

delock (Collaborator, Author):
@molly-smith tp_shard.py has been moved into the module_inject folder.


def set_num_kv_heads(num):
global num_kv_heads
num_kv_heads = num


def get_shard_size(total_size, mp_size, rank=None):
global num_kv_heads
# When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division
# In the case that total_size cannot be divided by num_kv_heads, only even sharding is possible
if num_kv_heads != None and (total_size % num_kv_heads) == 0:
if (rank == None):
rank = dist.get_rank()
my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0)
return (total_size // num_kv_heads) * my_slices
else:
if total_size % mp_size == 0:
return total_size // mp_size
else:
assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})"


def get_shard_size_list(total_size, mp_size):
shard_sizes = []
for i in range(mp_size):
shard_sizes.append(get_shard_size(total_size, mp_size, i))
return shard_sizes
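A minimal usage sketch of these helpers (values are illustrative; passing rank explicitly avoids needing an initialized communication backend, and the import path matches this snapshot of the PR before tp_shard.py was moved under module_inject):

from deepspeed.utils.tp_shard import set_num_kv_heads, get_shard_size, get_shard_size_list

set_num_kv_heads(5)                    # e.g. a model with 5 KV heads
print(get_shard_size_list(320, 3))     # [128, 128, 64]: 2, 2 and 1 heads of width 64
print(get_shard_size(320, 3, rank=2))  # 64
# When num_kv_heads is None or total_size is not a multiple of it, the helpers
# fall back to even division and assert if total_size is not divisible by mp_size.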
33 changes: 33 additions & 0 deletions tests/unit/inference/test_inference.py
@@ -544,6 +544,39 @@ def test(
print(local_rank, "deepspeed", ds_output)
assert assert_fn(bs_output, ds_output)

@pytest.mark.world_size(3)
def test_odd_world_size(
self,
model_w_task,
query,
inf_kwargs,
assert_fn,
dtype,
):
invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False)
if invalid_test_msg:
pytest.skip(invalid_test_msg)

model, task = model_w_task
if model == "Salesforce/codegen-350M-mono":
pytest.skip("codegen does not supported by odd world_size")
local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "3"))

# We have to load these large models on CPU with pipeline because not
# enough GPU memory
pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt")
bs_output = pipe(query, **inf_kwargs)

pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype)
# Switch device to GPU so that input tensors are not on CPU
pipe.device = torch.device(get_accelerator().device_name(local_rank))
ds_output = pipe(query, **inf_kwargs)

print(local_rank, "baseline", bs_output)
print(local_rank, "deepspeed", ds_output)
assert assert_fn(bs_output, ds_output)


@pytest.mark.nightly
@pytest.mark.parametrize(