Merge branch 'master' into loadams/inference-ops-test-repro

microsoft · Dec 18, 2024 · a238f3d · a238f3d
2 parents 735ac4c + b344c04
commit a238f3d
Show file tree

Hide file tree

Showing 5 changed files with 38 additions and 23 deletions.
diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml
@@ -4,6 +4,7 @@ on:
   workflow_dispatch:
   pull_request:
     paths:
+      - 'accelerator/**'
       - '.github/workflows/no-torch.yml'
       - 'op_builder/**'
   schedule:

diff --git a/CODEOWNERS b/CODEOWNERS
@@ -8,49 +8,52 @@
 
 # top-level repo folders
 /.github/ @loadams
-/azure/ @awan-10
-/benchmarks/ @awan-10 @tjruwase
+/azure/ @loadams
+/benchmarks/ @guanhuawang @tjruwase
 /bin/ @loadams
-/csrc/ @awan-10
+/csrc/ @tjruwase
 /deepspeed/ @loadams @tjruwase
-/docker/ @awan-10
+/docker/ @loadams @guanhuawang
 /docs/ @loadams @tjruwase
-/examples/ @awan-10 @tohtana
+/examples/ @jomayeri @tohtana
 /op_builder/ @loadams @tjruwase @jomayeri
-/release/ @loadams
+/release/ @loadams @jomayeri
 /requirements/ @loadams
-/scripts/ @awan-10
+/scripts/ @loadams @tjruwase
 /tests/ @tjruwase @loadams @tohtana
 
 # deepspeed
 /deepspeed/autotuning/ @loadams
 /deepspeed/checkpoint/ @tjruwase
-/deepspeed/comm/ @awan-10
+/deepspeed/comm/ @guanhuawang
 /deepspeed/compression/ @tjruwase
-/deepspeed/elasticity/ @awan-10
+/deepspeed/elasticity/ @tjruwase
 /deepspeed/launcher/ @loadams
-/deepspeed/module_inject/ @awan-10
+/deepspeed/module_inject/ @hwchen2017 @loadams
 /deepspeed/moe/ @tohtana
-/deepspeed/monitor/ @awan-10
+/deepspeed/monitor/ @tjruwase
 /deepspeed/nebula/ @tjruwase
+/deepspeed/nvme/ @tjruwase @jomayeri
 /deepspeed/ops/ @tohtana
 /deepspeed/pipe/ @tohtana @loadams
 /deepspeed/profiling/ @loadams
-/deepspeed/utils/ @tjruwase @awan-10
+/deepspeed/sequence/ @tohtana
+/deepspeed/utils/ @tjruwase @tohtana
 
 # inference
-/deepspeed/inference/ @awan-10
-/deepspeed/model_implementations/ @awan-10
+/deepspeed/inference/ @hwchen2017 @tohtana
+/deepspeed/model_implementations/ @tohtana @loadams
 
 # training
 /deepspeed/runtime/ @tjruwase @tohtana
 /deepspeed/runtime/activation_checkpointing/ @tjruwase
 /deepspeed/runtime/checkpoint_engine/ @tjruwase
-/deepspeed/runtime/comm/ @awan-10
-/deepspeed/runtime/compression/ @awan-10
+/deepspeed/runtime/comm/ @guanhuawang
+/deepspeed/runtime/compression/ @tjruwase
 /deepspeed/runtime/data_pipeline/ @tjruwase
-/deepspeed/runtime/fp16/ @tjruwase
-/deepspeed/runtime/fp16/onebit/ @awan-10
-/deepspeed/runtime/pipe/ @loadams
-/deepspeed/runtime/swap_tensor/ @tjruwase
-/deepspeed/runtime/zero/ @tjruwase
+/deepspeed/runtime/domino/ @guanhuawang @hwchen2017
+/deepspeed/runtime/fp16/ @tjruwase @tohtana
+/deepspeed/runtime/fp16/onebit/ @tjruwase
+/deepspeed/runtime/pipe/ @loadams @tohtana
+/deepspeed/runtime/swap_tensor/ @tjruwase @jomayeri
+/deepspeed/runtime/zero/ @tjruwase @tohtana
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py
@@ -3,9 +3,15 @@
 
 # DeepSpeed Team
 
-import torch
 from .abstract_accelerator import DeepSpeedAccelerator
 
+# During the setup stage torch may not be installed. Tolerating its absence
+# here allows the op builder related APIs to still be executed.
+try:
+    import torch
+except ImportError as e:
+    pass
+
 try:
     import oneccl_bindings_for_pytorch  # noqa: F401 # type: ignore
     oneccl_imported_p = True

diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
@@ -346,11 +346,15 @@ def _replace(self, child, name, conv_linear_layer):
                 weight, bias = shard_value_with_share_qk(child.weight.data, child.bias, dist.get_rank(),
                                                          dist.get_world_size(), False)
                 return LinearAllreduce(weight, bias, self.mp_group)
+        # For Arctic models, treat w2 weights as all-reduce linears even if they are not listed in all_reduce_linears
+        arctic_w2_all_reduce_linear = False
+        if 'Arctic' in str(self.module) and 'w2' in name:
+            arctic_w2_all_reduce_linear = True
         # For MLP including chunk layer.
         if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)):
             weight, bias = shard_chunk_mlp(child.weight.data, child.bias, dist.get_rank(), dist.get_world_size())
             return LinearLayer(weight=weight, bias=bias)
-        if name in self.all_reduce_linears:
+        if name in self.all_reduce_linears or arctic_w2_all_reduce_linear:
             # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
             # else [weight_shape[0], weight_shape[1] // mp_size]
 

diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md
@@ -121,6 +121,7 @@ The following results were collected using V100 SXM2 32GB GPUs.
 The following model families have been successfully tested with automatic tensor parallelism. Other models may work but have not been tested yet.
 
 - albert
+- arctic
 - baichuan
 - bert
 - bigbird_pegasus
-Original file line number
+Diff line change
@@ Expand Up @@
     The following model families have been successfully tested with automatic tensor parallelism. Other models may work but have not been tested yet.
     - albert
+    - arctic
     - baichuan
     - bert
     - bigbird_pegasus
@@ Expand Down @@