Skip to content

Commit

Permalink
[NPU] load EXPORT_ENV based on different accelerators to support mult…
Browse files Browse the repository at this point in the history
…i-node training on other devices (#4830)

Different hardware may require different environment variables. To
support multi-node training on NPU and other devices that rely on
different env vars, this commit adds an `export_envs()` method to each
accelerator and loads its result in runner.py.

For the related NPU work, see #4567.

Co-authored-by: Olatunji Ruwase <[email protected]>
  • Loading branch information
minchao-sun and tjruwase authored Dec 19, 2023
1 parent a00bdde commit 6d7b44a
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 3 deletions.
4 changes: 4 additions & 0 deletions accelerator/abstract_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,7 @@ def get_op_builder(self, class_name):
@abc.abstractmethod
def build_extension(self):
    """Abstract hook that every concrete accelerator must override.

    NOTE(review): the concrete implementations visible in this change all
    return torch's ``BuildExtension`` class -- confirm that this is the
    intended contract for every backend.
    """
    ...

@abc.abstractmethod
def export_envs(self):
    """Return the environment-variable name prefixes to export for this accelerator.

    The launcher concatenates the returned list with its own ``EXPORT_ENVS``
    list and exports every environment variable whose name starts with one
    of the prefixes, unless the variable is explicitly excluded.

    Returns:
        A list of string prefixes. Implementations with nothing extra to
        export return an empty list.
    """
    ...
3 changes: 3 additions & 0 deletions accelerator/cpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,6 @@ def get_op_builder(self, class_name):
def build_extension(self):
    """Return the ``BuildExtension`` class from ``torch.utils.cpp_extension``.

    The import is performed inside the method, so it only runs when the
    method is called rather than when this module is imported.
    """
    from torch.utils.cpp_extension import BuildExtension
    return BuildExtension

def export_envs(self):
    """Return the extra environment-variable prefixes to export for the CPU accelerator.

    The launcher adds the returned prefixes to its shared export list.
    This accelerator contributes none, so the list is empty.
    """
    return []
3 changes: 3 additions & 0 deletions accelerator/cuda_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,3 +322,6 @@ def get_op_builder(self, class_name):
def build_extension(self):
    """Return the ``BuildExtension`` class from ``torch.utils.cpp_extension``.

    The import is performed inside the method, so it only runs when the
    method is called rather than when this module is imported.
    """
    from torch.utils.cpp_extension import BuildExtension
    return BuildExtension

def export_envs(self):
    """Return the extra environment-variable prefixes to export for the CUDA accelerator.

    The launcher matches these by prefix, so ``'NCCL'`` selects every
    environment variable whose name starts with ``NCCL``.
    """
    return ['NCCL']
3 changes: 3 additions & 0 deletions accelerator/mps_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,3 +235,6 @@ def build_extension(self):
from torch.utils.cpp_extension import BuildExtension

return BuildExtension

def export_envs(self):
    """Return the extra environment-variable prefixes to export for the MPS accelerator.

    The launcher adds the returned prefixes to its shared export list.
    This accelerator contributes none, so the list is empty.
    """
    return []
3 changes: 3 additions & 0 deletions accelerator/npu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,6 @@ def get_op_builder(self, class_name):
def build_extension(self):
    """Return the ``BuildExtension`` class from ``torch.utils.cpp_extension``.

    The import is performed inside the method, so it only runs when the
    method is called rather than when this module is imported.
    """
    from torch.utils.cpp_extension import BuildExtension
    return BuildExtension

def export_envs(self):
    """Return the extra environment-variable prefixes to export for the NPU accelerator.

    The launcher matches these by prefix, not by exact name: ``'LD_LIBRARY'``
    therefore also selects ``LD_LIBRARY_PATH``, and ``'PATH'`` selects any
    variable whose name begins with ``PATH``.
    """
    return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH']
7 changes: 4 additions & 3 deletions deepspeed/launcher/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from deepspeed.accelerator import get_accelerator

DLTS_HOSTFILE = "/job/hostfile"
# The next two assignments are the before/after sides of one changed line in
# this diff view. 'NCCL' is dropped from the shared prefix list because the
# CUDA accelerator's export_envs() now supplies it.
EXPORT_ENVS = ['MLFLOW', 'NCCL', 'PYTHON', 'MV2', 'UCX']
EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX']
EXPORT_ENVS += NEBULA_EXPORT_ENVS
# Environment file name; the DS_ENV_FILE variable overrides the default.
DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env")
# Home directory and current directory -- presumably the locations searched
# for the environment file; verify against the code that reads this list.
DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
Expand Down Expand Up @@ -544,9 +544,10 @@ def main(args=None):
# key exists in launcher env -> var list should be used
excluded_vars += var_list

exports = ""
# load envs from accelerator
exports = EXPORT_ENVS + get_accelerator().export_envs()
for var in env.keys():
if any([var.startswith(name) for name in EXPORT_ENVS]):
if any([var.startswith(name) for name in exports]):
if not any([var == name for name in excluded_vars]):
runner.add_export(var, env[var])

Expand Down

0 comments on commit 6d7b44a

Please sign in to comment.