Skip to content

Commit

Permalink
Merge branch 'master' into stage3-use-new-grad-acc-api
Browse files Browse the repository at this point in the history
  • Loading branch information
loadams authored Dec 18, 2024
2 parents 4ebec9d + b344c04 commit d3016ad
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 22 deletions.
45 changes: 24 additions & 21 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -8,49 +8,52 @@

# top-level repo folders
/.github/ @loadams
/azure/ @awan-10
/benchmarks/ @awan-10 @tjruwase
/azure/ @loadams
/benchmarks/ @guanhuawang @tjruwase
/bin/ @loadams
/csrc/ @awan-10
/csrc/ @tjruwase
/deepspeed/ @loadams @tjruwase
/docker/ @awan-10
/docker/ @loadams @guanhuawang
/docs/ @loadams @tjruwase
/examples/ @awan-10 @tohtana
/examples/ @jomayeri @tohtana
/op_builder/ @loadams @tjruwase @jomayeri
/release/ @loadams
/release/ @loadams @jomayeri
/requirements/ @loadams
/scripts/ @awan-10
/scripts/ @loadams @tjruwase
/tests/ @tjruwase @loadams @tohtana

# deepspeed
/deepspeed/autotuning/ @loadams
/deepspeed/checkpoint/ @tjruwase
/deepspeed/comm/ @awan-10
/deepspeed/comm/ @guanhuawang
/deepspeed/compression/ @tjruwase
/deepspeed/elasticity/ @awan-10
/deepspeed/elasticity/ @tjruwase
/deepspeed/launcher/ @loadams
/deepspeed/module_inject/ @awan-10
/deepspeed/module_inject/ @hwchen2017 @loadams
/deepspeed/moe/ @tohtana
/deepspeed/monitor/ @awan-10
/deepspeed/monitor/ @tjruwase
/deepspeed/nebula/ @tjruwase
/deepspeed/nvme/ @tjruwase @jomayeri
/deepspeed/ops/ @tohtana
/deepspeed/pipe/ @tohtana @loadams
/deepspeed/profiling/ @loadams
/deepspeed/utils/ @tjruwase @awan-10
/deepspeed/sequence/ @tohtana
/deepspeed/utils/ @tjruwase @tohtana

# inference
/deepspeed/inference/ @awan-10
/deepspeed/model_implementations/ @awan-10
/deepspeed/inference/ @hwchen2017 @tohtana
/deepspeed/model_implementations/ @tohtana @loadams

# training
/deepspeed/runtime/ @tjruwase @tohtana
/deepspeed/runtime/activation_checkpointing/ @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase
/deepspeed/runtime/comm/ @awan-10
/deepspeed/runtime/compression/ @awan-10
/deepspeed/runtime/comm/ @guanhuawang
/deepspeed/runtime/compression/ @tjruwase
/deepspeed/runtime/data_pipeline/ @tjruwase
/deepspeed/runtime/fp16/ @tjruwase
/deepspeed/runtime/fp16/onebit/ @awan-10
/deepspeed/runtime/pipe/ @loadams
/deepspeed/runtime/swap_tensor/ @tjruwase
/deepspeed/runtime/zero/ @tjruwase
/deepspeed/runtime/domino/ @guanhuawang @hwchen2017
/deepspeed/runtime/fp16/ @tjruwase @tohtana
/deepspeed/runtime/fp16/onebit/ @tjruwase
/deepspeed/runtime/pipe/ @loadams @tohtana
/deepspeed/runtime/swap_tensor/ @tjruwase @jomayeri
/deepspeed/runtime/zero/ @tjruwase @tohtana
6 changes: 5 additions & 1 deletion deepspeed/module_inject/auto_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,11 +346,15 @@ def _replace(self, child, name, conv_linear_layer):
weight, bias = shard_value_with_share_qk(child.weight.data, child.bias, dist.get_rank(),
dist.get_world_size(), False)
return LinearAllreduce(weight, bias, self.mp_group)
# For Arctic model, bypass to all_reduce replacement for w2 weights
arctic_w2_all_reduce_linear = False
if 'Arctic' in str(self.module) and 'w2' in name:
arctic_w2_all_reduce_linear = True
# For MLP including chunk layer.
if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)):
weight, bias = shard_chunk_mlp(child.weight.data, child.bias, dist.get_rank(), dist.get_world_size())
return LinearLayer(weight=weight, bias=bias)
if name in self.all_reduce_linears:
if name in self.all_reduce_linears or arctic_w2_all_reduce_linear:
# if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
# else [weight_shape[0], weight_shape[1] // mp_size]

Expand Down
1 change: 1 addition & 0 deletions docs/_tutorials/automatic-tensor-parallelism.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ The following results were collected using V100 SXM2 32GB GPUs.
The following model families have been successfully tested with automatic tensor parallelism. Other models may work but have not been tested yet.

- albert
- arctic
- baichuan
- bert
- bigbird_pegasus
Expand Down

0 comments on commit d3016ad

Please sign in to comment.