Skip to content

Commit

Permalink
add device config env for the accelerator (#5396)
Browse files Browse the repository at this point in the history
Thank you for the [PR](#5369) and for
@delock's contribution of ideas.
As mentioned in this
[pr](#5369), each device has
its own environmental variables.
We add visible_devices_envs() and set_visible_devices_envs() methods
to the accelerator class so that each accelerator can implement its env
settings within the interface, which makes the launcher more generic
across accelerators.

This commit has been tested on NPU machines, each with 8 Ascend NPUs.

---------

Co-authored-by: yangcheng <[email protected]>
Co-authored-by: eigen2017 <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
Co-authored-by: Olatunji Ruwase <[email protected]>
  • Loading branch information
5 people authored Apr 20, 2024
1 parent 99951ca commit 3f875d9
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 9 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,12 @@ dynamically link them at runtime.
## Contributed HW support
* DeepSpeed now supports various HW accelerators.

| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated |
| ----------- | -------- | ---------------- | --------------------- | ------------------ |
| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes |
| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes |
| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes |
| Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated |
|-------------|-------------------------------------|------------------| --------------------- |--------------------|
| Huawei | Huawei Ascend NPU | npu | Yes | No |
| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes |
| Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes |
| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes |

## PyPI
We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases.
Expand Down
8 changes: 8 additions & 0 deletions accelerator/abstract_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,3 +287,11 @@ def build_extension(self):
@abc.abstractmethod
def export_envs(self):
...

@abc.abstractmethod
def visible_devices_envs(self):
    """Return the list of visible-devices environment variable names.

    Abstract hook: every concrete accelerator provides its own list.
    """

@abc.abstractmethod
def set_visible_devices_envs(self, current_env, local_accelerator_ids):
    """Store the local accelerator ids in ``current_env``.

    Abstract hook: every concrete accelerator provides its own
    implementation.

    Args:
        current_env: environment mapping to be updated.
        local_accelerator_ids: ids of the accelerators on the local node.
    """
8 changes: 8 additions & 0 deletions accelerator/cpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,3 +322,11 @@ def build_extension(self):

def export_envs(self):
return []

# TODO: settle the visible-devices env var for cpu; CUDA_VISIBLE_DEVICES is
# kept for now.
# NOTE(review): the original note read "is confirmed" -- presumably
# "is not confirmed" was meant; verify.
def visible_devices_envs(self):
    """Return the list of visible-devices environment variable names."""
    env_names = ['CUDA_VISIBLE_DEVICES']
    return env_names

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
    """Write the local accelerator ids into ``current_env``.

    Every variable named by ``visible_devices_envs()`` is set to the ids
    joined with commas.

    Args:
        current_env: environment mapping, modified in place.
        local_accelerator_ids: iterable of ids; each is passed through str().
    """
    for env_name in self.visible_devices_envs():
        id_strings = [str(device_id) for device_id in local_accelerator_ids]
        current_env[env_name] = ",".join(id_strings)
7 changes: 7 additions & 0 deletions accelerator/cuda_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,10 @@ def build_extension(self):

def export_envs(self):
return ['NCCL']

def visible_devices_envs(self):
    """Return the list of visible-devices environment variable names."""
    env_names = ['CUDA_VISIBLE_DEVICES']
    return env_names

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
    """Write the local accelerator ids into ``current_env``.

    Every variable named by ``visible_devices_envs()`` is set to the ids
    joined with commas.

    Args:
        current_env: environment mapping, modified in place.
        local_accelerator_ids: iterable of ids; each is passed through str().
    """
    for env_name in self.visible_devices_envs():
        id_strings = [str(device_id) for device_id in local_accelerator_ids]
        current_env[env_name] = ",".join(id_strings)
7 changes: 7 additions & 0 deletions accelerator/hpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,3 +294,10 @@ def build_extension(self):

def export_envs(self):
return []

def visible_devices_envs(self):
    """Return the list of visible-devices environment variable names."""
    env_names = ['HABANA_VISIBLE_MODULES']
    return env_names

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
    """Write the local accelerator ids into ``current_env``.

    Every variable named by ``visible_devices_envs()`` is set to the ids
    joined with commas.

    Args:
        current_env: environment mapping, modified in place.
        local_accelerator_ids: iterable of ids; each is passed through str().
    """
    for env_name in self.visible_devices_envs():
        id_strings = [str(device_id) for device_id in local_accelerator_ids]
        current_env[env_name] = ",".join(id_strings)
9 changes: 9 additions & 0 deletions accelerator/mps_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,12 @@ def build_extension(self):

def export_envs(self):
return []

# TODO: no visible-devices env var could be found for mps, so
# CUDA_VISIBLE_DEVICES is kept for now.
def visible_devices_envs(self):
    """Return the list of visible-devices environment variable names."""
    env_names = ['CUDA_VISIBLE_DEVICES']
    return env_names

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
    """Write the local accelerator ids into ``current_env``.

    Every variable named by ``visible_devices_envs()`` is set to the ids
    joined with commas.

    Args:
        current_env: environment mapping, modified in place.
        local_accelerator_ids: iterable of ids; each is passed through str().
    """
    for env_name in self.visible_devices_envs():
        id_strings = [str(device_id) for device_id in local_accelerator_ids]
        current_env[env_name] = ",".join(id_strings)
7 changes: 7 additions & 0 deletions accelerator/npu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,3 +278,10 @@ def build_extension(self):

def export_envs(self):
return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH']

def visible_devices_envs(self):
    """Return the list of visible-devices environment variable names."""
    env_names = ['ASCEND_RT_VISIBLE_DEVICES']
    return env_names

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
    """Write the local accelerator ids into ``current_env``.

    Every variable named by ``visible_devices_envs()`` is set to the ids
    joined with commas.

    Args:
        current_env: environment mapping, modified in place.
        local_accelerator_ids: iterable of ids; each is passed through str().
    """
    for env_name in self.visible_devices_envs():
        id_strings = [str(device_id) for device_id in local_accelerator_ids]
        current_env[env_name] = ",".join(id_strings)
7 changes: 7 additions & 0 deletions accelerator/xpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,3 +289,10 @@ def build_extension(self):

def export_envs(self):
return []

def visible_devices_envs(self):
    """Return the list of visible-devices environment variable names."""
    env_names = ['ZE_AFFINITY_MASK']
    return env_names

def set_visible_devices_envs(self, current_env, local_accelerator_ids):
    """Write the local accelerator ids into ``current_env``.

    Every variable named by ``visible_devices_envs()`` is set to the ids
    joined with commas.

    Args:
        current_env: environment mapping, modified in place.
        local_accelerator_ids: iterable of ids; each is passed through str().
    """
    for env_name in self.visible_devices_envs():
        id_strings = [str(device_id) for device_id in local_accelerator_ids]
        current_env[env_name] = ",".join(id_strings)
11 changes: 7 additions & 4 deletions deepspeed/launcher/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from collections import defaultdict
from typing import Dict
from argparse import ArgumentParser, REMAINDER
from deepspeed.accelerator import get_accelerator
from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
from ..nebula.constants import DLTS_POD_ENV_PATH
from ..utils import logger, get_numactl_cmd
Expand Down Expand Up @@ -146,8 +147,8 @@ def main():
node_list = list(world_info.keys())
args.nnodes = len(node_list)
local_node = node_list[args.node_rank]
local_gpu_ids = world_info[local_node]
num_local_procs = len(local_gpu_ids)
local_accelerator_ids = world_info[local_node]
num_local_procs = len(local_accelerator_ids)
logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}")

global_rank_mapping = defaultdict(list)
Expand All @@ -161,8 +162,10 @@ def main():
curr_global_rank += 1
logger.info(f"global_rank_mapping={global_rank_mapping}")
logger.info(f"dist_world_size={dist_world_size}")
current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids))
logger.info(f"Setting CUDA_VISIBLE_DEVICES={current_env['CUDA_VISIBLE_DEVICES']}")

get_accelerator().set_visible_devices_envs(current_env, local_accelerator_ids)
for env in get_accelerator().visible_devices_envs():
logger.info(f"Setting {env}={current_env[env]}")

# set PyTorch distributed related environmental variables
current_env["MASTER_ADDR"] = args.master_addr
Expand Down

0 comments on commit 3f875d9

Please sign in to comment.