Add MLP/lm_head tp grain size setting. (#6828)
This PR adds an MLP/lm_head tensor-parallel (TP) grain-size setting to the
deepspeed.init_inference() API, making the MLP/lm_head sharding granularity
configurable.

DNN libraries favor tensor sizes that are multiples of a power of 2, so we pick
64 as the default.

We aim to make the MLP/lm_head TP grain size flexible to set. This is a
preliminary solution; if there is a better one, we can discuss it together.
Thanks~
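
A minimal usage sketch (the model name is illustrative; the tensor_parallel
keyword is parsed into DeepSpeedTPConfig, so tp_grain_size can be passed
alongside tp_size):

import deepspeed
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype=torch.float16)

# tp_grain_size controls the multiple used when sharding MLP/lm_head weights
# across tp_size devices; 64 is the default and matches the previously hard-coded value.
engine = deepspeed.init_inference(
    model,
    dtype=torch.float16,
    tensor_parallel={"tp_size": 2, "tp_grain_size": 64},
)

# Run with multiple processes, e.g.: deepspeed --num_gpus 2 run_inference.py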

---------

Co-authored-by: Logan Adams <[email protected]>
Co-authored-by: Olatunji Ruwase <[email protected]>
3 people authored Dec 16, 2024
1 parent 87c6506 commit da771ed
Showing 3 changed files with 15 additions and 4 deletions.
3 changes: 3 additions & 0 deletions deepspeed/inference/config.py
@@ -40,6 +40,9 @@ class DeepSpeedTPConfig(DeepSpeedConfigModel):
    tp_size: int = 1
    """ Number of devices to split the model across using tensor parallelism. """

+    tp_grain_size: int = 64
+    "Desired MLP/lm_head tp size granularity. DNN library favors tensor size in granularity of power of 2, we pick 64 as a default size."
+
    mpu: object = None
    """
    A model parallelism unit object that implements
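
A quick sketch of constructing the config model directly (DeepSpeedTPConfig is
the pydantic model shown above; in practice it is usually built from the
tensor_parallel argument to init_inference):

from deepspeed.inference.config import DeepSpeedTPConfig

tp_cfg = DeepSpeedTPConfig(tp_size=2, tp_grain_size=128)  # override the default of 64
print(tp_cfg.tp_grain_size)  # 128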
5 changes: 4 additions & 1 deletion deepspeed/module_inject/replace_module.py
@@ -17,7 +17,7 @@
from .layers import TensorParallelOcShardConv2d, TensorParallelIcShardConv2d

from deepspeed import comm as dist
-from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads
+from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads, set_tp_grain_size

from .load_checkpoint import load_model_with_checkpoint
import time

@@ -303,6 +303,9 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
    if hasattr(model_config, 'num_attention_heads'):
        set_num_attention_heads(getattr(model_config, 'num_attention_heads'))

+    # 4.4 set tp_grain_size
+    set_tp_grain_size(config.tensor_parallel.tp_grain_size)
+
    # 5. Set linear policies
    _autotp.update_linear_policies()

11 changes: 8 additions & 3 deletions deepspeed/module_inject/tp_shard.py
@@ -22,6 +22,11 @@ def set_n_embd(num):
    n_embd = num


+def set_tp_grain_size(num):
+    global tp_grain_size
+    tp_grain_size = num
+
+
def get_num_kv_heads():
    global num_kv_heads
    if 'num_kv_heads' in globals():

@@ -45,9 +50,9 @@ def get_shard_size(total_size, mp_size, name=None, rank=None):
        my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0)
        return total_size * my_slices // num_kv_heads
    else:
-        if total_size >= 64:
-            grain_size = total_size // 64
-            return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * 64
+        if total_size >= tp_grain_size:
+            grain_size = total_size // tp_grain_size
+            return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * tp_grain_size
        else:
            return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0)

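To make the sharding rule concrete, here is a small standalone sketch (the helper
name and the sizes are illustrative, not part of the commit) that mirrors the
grain-size branch above:

def shard_size(total_size, mp_size, rank, tp_grain_size=64):
    # Shard in whole multiples of tp_grain_size, giving any remainder grains to the
    # lowest-numbered ranks; fall back to per-element splitting for small tensors.
    if total_size >= tp_grain_size:
        grains = total_size // tp_grain_size
        return (grains // mp_size + (1 if rank < (grains % mp_size) else 0)) * tp_grain_size
    return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0)

# Example: an 11008-wide MLP dimension split across 3 ranks with the default grain of 64.
# 11008 // 64 = 172 grains -> 58, 57, 57 grains per rank -> 3712, 3648, 3648 elements.
print([shard_size(11008, 3, r) for r in range(3)])  # [3712, 3648, 3648]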
