From d272298b13ab4396a4f5f002480fd4937895b937 Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Tue, 19 Mar 2024 11:56:59 +0200 Subject: [PATCH 1/4] Added get_compile_backend API to accelerators to avoid accelerator checks in compile tests. --- accelerator/abstract_accelerator.py | 4 ++++ accelerator/cpu_accelerator.py | 3 +++ accelerator/cuda_accelerator.py | 3 +++ accelerator/hpu_accelerator.py | 3 +++ accelerator/mps_accelerator.py | 3 +++ accelerator/npu_accelerator.py | 3 +++ accelerator/xpu_accelerator.py | 3 +++ tests/unit/runtime/compile/test_compile_wrapper.py | 4 +--- tests/unit/runtime/compile/test_compile_zero.py | 4 +--- tests/unit/runtime/compile/test_load_config.py | 4 +--- 10 files changed, 25 insertions(+), 9 deletions(-) diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 3c5d799e293e..bdf97b689083 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -287,3 +287,7 @@ def build_extension(self): @abc.abstractmethod def export_envs(self): ... + + @abc.abstractmethod + def get_compile_backend(self): + ... 
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 870d3e91816e..135b5a5c3c7e 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -320,3 +320,6 @@ def build_extension(self): def export_envs(self): return [] + + def get_compile_backend(self): + return "inductor" diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 3d5e9c168c16..4fbf6615992d 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -360,3 +360,6 @@ def build_extension(self): def export_envs(self): return ['NCCL'] + + def get_compile_backend(self): + return "inductor" diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 30b115e8b1ab..46b75e31f5c3 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -294,3 +294,6 @@ def build_extension(self): def export_envs(self): return [] + + def get_compile_backend(self): + return "hpu_backend" diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 972b33caece1..ae36e541f869 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -258,3 +258,6 @@ def build_extension(self): def export_envs(self): return [] + + def get_compile_backend(self): + return "inductor" diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 472157e32c02..4912fd05f8c4 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -278,3 +278,6 @@ def build_extension(self): def export_envs(self): return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] + + def get_compile_backend(self): + return "inductor" diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 3f65263946ab..56fb21175ad1 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -289,3 +289,6 @@ def build_extension(self): def export_envs(self): return [] + + def get_compile_backend(self): + return 
"inductor" diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py index 0bebeed117b4..91302bb61dc7 100644 --- a/tests/unit/runtime/compile/test_compile_wrapper.py +++ b/tests/unit/runtime/compile/test_compile_wrapper.py @@ -31,11 +31,9 @@ def base_config(): }, "compile": { "enabled": True, - "backend": "inductor" + "backend": get_accelerator().get_compile_backend() } } - if get_accelerator().device_name() == 'hpu': - config_dict['compile']['backend'] = 'hpu_backend' return config_dict diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py index 79ab5efd5099..db22c5280d3a 100644 --- a/tests/unit/runtime/compile/test_compile_zero.py +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -51,12 +51,10 @@ def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device): }, "compile": { "enabled": True, - "backend": "inductor" + "backend": get_accelerator().get_compile_backend() } } - if get_accelerator().device_name() == 'hpu': - config_dict['compile']['backend'] = 'hpu_backend' if offload_device == OffloadDeviceEnum.cpu: config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} elif offload_device == OffloadDeviceEnum.nvme: diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py index f3c53ede91fd..35b61203e18a 100644 --- a/tests/unit/runtime/compile/test_load_config.py +++ b/tests/unit/runtime/compile/test_load_config.py @@ -47,12 +47,10 @@ def base_config(): }, "compile": { "enabled": True, - "backend": "inductor" + "backend": get_accelerator().get_compile_backend() } } - if get_accelerator().device_name() == 'hpu': - config_dict['compile']['backend'] = 'hpu_backend' return config_dict From 6d2828975e99d2357a6e3200b5c5685160702cdc Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Thu, 21 Mar 2024 11:28:42 +0200 Subject: [PATCH 2/4] Refactor 
get_compile_backend method for user-defined backend handling. Refactored the get_compile_backend method to handle user backend selection more efficiently. If no backend is specified, it now defaults to the accelerator's preferred backend. Additionally, improved error handling for unsupported backends by providing informative error messages. --- accelerator/abstract_accelerator.py | 2 +- accelerator/cpu_accelerator.py | 11 +++++++++-- accelerator/cuda_accelerator.py | 11 +++++++++-- accelerator/hpu_accelerator.py | 11 +++++++++-- accelerator/mps_accelerator.py | 11 +++++++++-- accelerator/npu_accelerator.py | 11 +++++++++-- accelerator/xpu_accelerator.py | 11 +++++++++-- 7 files changed, 55 insertions(+), 13 deletions(-) diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index bdf97b689083..65fb2722c435 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -289,5 +289,5 @@ def export_envs(self): ... @abc.abstractmethod - def get_compile_backend(self): + def get_compile_backend(self, backend=None): ... diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 135b5a5c3c7e..61ed7511849c 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -321,5 +321,12 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self): - return "inductor" + def get_compile_backend(self, backend=None): + supported_backends = torch._dynamo.list_backends() + if backend is None: + return "inductor" + elif backend in supported_backends: + return backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 4fbf6615992d..4820799808bb 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -361,5 +361,12 @@ def build_extension(self): def export_envs(self): return ['NCCL'] - def get_compile_backend(self): - return "inductor" + def get_compile_backend(self, backend=None): + supported_backends = torch._dynamo.list_backends() + if backend is None: + return "inductor" + elif backend in supported_backends: + return backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 46b75e31f5c3..f840c1941e09 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -295,5 +295,12 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self): - return "hpu_backend" + def get_compile_backend(self, backend=None): + supported_backends = torch._dynamo.list_backends() + if backend is None: + return "hpu_backend" + elif backend in supported_backends: + return backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index ae36e541f869..63132737383f 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -259,5 +259,12 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self): - return "inductor" + def get_compile_backend(self, backend=None): + supported_backends = torch._dynamo.list_backends() + if backend is None: + return "inductor" + elif backend in supported_backends: + return backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 4912fd05f8c4..cf3b872185a5 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -279,5 +279,12 @@ def build_extension(self): def export_envs(self): return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] - def get_compile_backend(self): - return "inductor" + def get_compile_backend(self, backend=None): + supported_backends = torch._dynamo.list_backends() + if backend is None: + return "inductor" + elif backend in supported_backends: + return backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends }") diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 56fb21175ad1..4cc08c4b327e 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -290,5 +290,12 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self): - return "inductor" + def get_compile_backend(self, backend=None): + supported_backends = torch._dynamo.list_backends() + if backend is None: + return "inductor" + elif backend in supported_backends: + return backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") From 057f1c390a8603b86c534e80015b04bd2b1a629c Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Wed, 24 Apr 2024 08:38:42 +0300 Subject: [PATCH 3/4] Add getter and setter methods for compile_backend across accelerators. 
--- accelerator/abstract_accelerator.py | 7 ++++++- accelerator/cpu_accelerator.py | 14 ++++++++------ accelerator/cuda_accelerator.py | 14 ++++++++------ accelerator/hpu_accelerator.py | 14 ++++++++------ accelerator/mps_accelerator.py | 14 ++++++++------ accelerator/npu_accelerator.py | 14 ++++++++------ accelerator/xpu_accelerator.py | 14 ++++++++------ 7 files changed, 54 insertions(+), 37 deletions(-) diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 65fb2722c435..8831bb00b252 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -12,6 +12,7 @@ class DeepSpeedAccelerator(ABC): def __init__(self): self._name = None self._communication_backend_name = None + self._compile_backend = None @abc.abstractmethod def is_synchronized_device(self): @@ -289,5 +290,9 @@ def export_envs(self): ... @abc.abstractmethod - def get_compile_backend(self, backend=None): + def get_compile_backend(self): + ... + + @abc.abstractmethod + def set_compile_backend(self, backend): ... 
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 61ed7511849c..1f4f84bddf7e 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -20,6 +20,7 @@ class CPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'cpu' + self._compile_backend = "inductor" if oneccl_imported_p: self._communication_backend_name = 'ccl' else: @@ -321,12 +322,13 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self, backend=None): - supported_backends = torch._dynamo.list_backends() - if backend is None: - return "inductor" - elif backend in supported_backends: - return backend + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend else: raise ValueError( f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 4820799808bb..53c5edb4753e 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -25,6 +25,7 @@ class CUDA_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'cuda' self._communication_backend_name = 'nccl' + self._compile_backend = "inductor" if pynvml is None: self._init_pynvml() @@ -361,12 +362,13 @@ def build_extension(self): def export_envs(self): return ['NCCL'] - def get_compile_backend(self, backend=None): - supported_backends = torch._dynamo.list_backends() - if backend is None: - return "inductor" - elif backend in supported_backends: - return backend + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend else: 
raise ValueError( f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index f840c1941e09..f2b8e810ad46 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -16,6 +16,7 @@ class HPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'hpu' self._communication_backend_name = 'hccl' + self._compile_backend = "hpu_backend" try: import habana_frameworks.torch.hpu as hpu hpu.setDeterministic(True) @@ -295,12 +296,13 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self, backend=None): - supported_backends = torch._dynamo.list_backends() - if backend is None: - return "hpu_backend" - elif backend in supported_backends: - return backend + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend else: raise ValueError( f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 63132737383f..fe4f9ca6e905 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -20,6 +20,7 @@ class MPS_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = "mps" self._communication_backend_name = None + self._compile_backend = "inductor" def is_synchronized_device(self): return False @@ -259,12 +260,13 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self, backend=None): - supported_backends = torch._dynamo.list_backends() - if backend is None: - return "inductor" - elif backend in supported_backends: - return backend + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend else: raise ValueError( f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index cf3b872185a5..d1081381a87a 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -20,6 +20,7 @@ def __init__(self): super().__init__() self._name = 'npu' self._communication_backend_name = 'hccl' + self._compile_backend = "inductor" # dict that holds class name <--> class type mapping i.e. 
# 'AsyncIOBuilder': # this dict will be filled at init stage @@ -279,12 +280,13 @@ def build_extension(self): def export_envs(self): return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] - def get_compile_backend(self, backend=None): - supported_backends = torch._dynamo.list_backends() - if backend is None: - return "inductor" - elif backend in supported_backends: - return backend + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend else: raise ValueError( f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends }") diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 4cc08c4b327e..5c7e3a7ed502 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -14,6 +14,7 @@ class XPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'xpu' self._communication_backend_name = 'ccl' + self._compile_backend = "inductor" self.aligned_tensors = [] def is_synchronized_device(self): @@ -290,12 +291,13 @@ def build_extension(self): def export_envs(self): return [] - def get_compile_backend(self, backend=None): - supported_backends = torch._dynamo.list_backends() - if backend is None: - return "inductor" - elif backend in supported_backends: - return backend + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend else: raise ValueError( f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") From 9335035bb11fcdd460cb766b0175aa429a315881 Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Wed, 24 Apr 2024 08:52:13 +0300 Subject: [PATCH 4/4] Fix pre-commit checks for accelerators. --- accelerator/abstract_accelerator.py | 2 +- accelerator/cpu_accelerator.py | 2 +- accelerator/cuda_accelerator.py | 2 +- accelerator/hpu_accelerator.py | 2 +- accelerator/mps_accelerator.py | 2 +- accelerator/npu_accelerator.py | 2 +- accelerator/xpu_accelerator.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 65e6e5979034..690f19292299 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -303,4 +303,4 @@ def get_compile_backend(self): @abc.abstractmethod def set_compile_backend(self, backend): - ... \ No newline at end of file + ... diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index f95a92ca6be3..bd11d034f312 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -341,4 +341,4 @@ def set_compile_backend(self, backend): self._compile_backend = backend else: raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 4cca9b7ef65f..60d66b6cdbab 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -378,4 +378,4 @@ def set_compile_backend(self, backend): self._compile_backend = backend else: raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 04e89cb8a603..114f367e879d 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -312,4 +312,4 @@ def set_compile_backend(self, backend): self._compile_backend = backend else: raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 5bf478f50d0b..5fc9b1c8cfb6 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -278,4 +278,4 @@ def set_compile_backend(self, backend): self._compile_backend = backend else: raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 214ffd6f25ae..b0e0ff948e52 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -296,4 +296,4 @@ def set_compile_backend(self, backend): self._compile_backend = backend else: raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends }") \ No newline at end of file + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends }") diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index a69dbd245260..9c4a9c903f96 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -307,4 +307,4 @@ def set_compile_backend(self, backend): self._compile_backend = backend else: raise ValueError( - f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") \ No newline at end of file + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}")