From d2ef20e75916fe91f75d4058856bd67490753818 Mon Sep 17 00:00:00 2001
From: vikram singh shekhawat
Date: Wed, 13 Mar 2024 21:43:46 +0530
Subject: [PATCH 1/4] Skip 1-bit compression and sparse-gradient tests for HPU. (#5270)

The HPU accelerator does not support the 1-bit compression and
sparse-gradient tests.
---
 tests/unit/runtime/half_precision/onebit/test_onebit.py | 3 +++
 .../sparse_tensor/test_averaging_sparse_gradients.py    | 5 +++++
 tests/unit/runtime/sparse_tensor/test_sparse_grads.py   | 6 +++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py
index 71b49b7723b6..32ee262f3714 100644
--- a/tests/unit/runtime/half_precision/onebit/test_onebit.py
+++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py
@@ -33,6 +33,9 @@
     pytest.skip("NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5",
                 allow_module_level=True)
 
+if get_accelerator().device_name() == 'hpu':
+    pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True)
+
 
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"])
 class TestOneBitAdamBasic(DistributedTest):
diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
index 92da2257bdb0..badd0bcee549 100644
--- a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
+++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
@@ -4,9 +4,14 @@
 # DeepSpeed Team
 
 import torch
+import pytest
 import deepspeed
 from unit.common import DistributedTest
 from unit.util import skip_on_arch
+from deepspeed.accelerator import get_accelerator
+
+if get_accelerator().device_name() == 'hpu':
+    pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True)
 
 
 class Model(torch.nn.Module):
diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py
index 0689adc08670..6338a16b8dbb 100644
--- a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py
+++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py
@@ -4,11 +4,15 @@
 # DeepSpeed Team
 
 import torch
+import pytest
 import deepspeed
 from unit.common import DistributedTest
-
+from deepspeed.accelerator import get_accelerator
 import deepspeed.utils.groups as groups
 
+if get_accelerator().device_name() == 'hpu':
+    pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True)
+
 
 class Model(torch.nn.Module):
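Note: the three hunks in this patch all apply the same gate. Distilled, the
pattern is a module-level pytest skip keyed on the active accelerator; a
minimal sketch follows, with a placeholder skip message rather than any one
file's wording:

    # Module-scope guard: aborts collection of every test in this file on HPU.
    import pytest

    from deepspeed.accelerator import get_accelerator

    if get_accelerator().device_name() == 'hpu':
        # allow_module_level=True lets pytest.skip be called outside a test.
        pytest.skip("feature not supported by HPU.", allow_module_level=True)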
From cf58c535df64b8e160c1a32862a7a7043ebdb558 Mon Sep 17 00:00:00 2001
From: vikram singh shekhawat
Date: Wed, 13 Mar 2024 21:44:15 +0530
Subject: [PATCH 2/4] Enabled LMCorrectness inference tests on HPU. (#5271)

The lm_eval API (v0.3.0) does not currently support the HPU accelerator,
so to run the LMCorrectness tests on HPU, create the lm_eval model on CPU
and move it to the HPU accelerator.
---
 tests/unit/inference/test_inference.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index f3056a225a9b..4e203a71db60 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -653,8 +653,15 @@ def no_pool_bootstrap_stderr(f, xs, iters):
             setattr(lm, model_family, getattr(lm, model_family).half().to(device))
             lm._device = device
         else:
-            lm = lm_eval.models.get_model(model_family).create_from_arg_string(
-                f"pretrained={model_name}", {"device": get_accelerator().device_name()})
+            if get_accelerator().device_name() == 'hpu':
+                # lm_eval does not support the HPU device, so create the model on CPU and move it to HPU.
+                lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}",
+                                                                                   {"device": "cpu"})
+                setattr(lm, model_family, getattr(lm, model_family).to(device))
+                lm._device = device
+            else:
+                lm = lm_eval.models.get_model(model_family).create_from_arg_string(
+                    f"pretrained={model_name}", {"device": get_accelerator().device_name()})
 
         get_accelerator().synchronize()
         start = time.time()
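Note: stripped of test scaffolding, the change above reduces to the pattern
below. This is a sketch in which model_family, model_name, and device stand
in for values the surrounding test computes:

    # lm_eval v0.3.0 cannot place a model on HPU directly, so build it on
    # CPU and migrate the underlying torch module afterwards.
    import lm_eval.models

    from deepspeed.accelerator import get_accelerator

    model_family, model_name = "gpt2", "gpt2"  # placeholders for the test's parametrization
    device = "hpu"                             # placeholder for the test's target device

    if get_accelerator().device_name() == 'hpu':
        lm = lm_eval.models.get_model(model_family).create_from_arg_string(
            f"pretrained={model_name}", {"device": "cpu"})               # instantiate on CPU
        setattr(lm, model_family, getattr(lm, model_family).to(device))  # move the weights to HPU
        lm._device = device                                              # keep lm_eval's device bookkeeping in sync
    else:
        lm = lm_eval.models.get_model(model_family).create_from_arg_string(
            f"pretrained={model_name}", {"device": get_accelerator().device_name()})

From a6fb4d3e237627dc2ab399f39294b051ae454e96 Mon Sep 17 00:00:00 2001
From: vikram singh shekhawat
Date: Wed, 13 Mar 2024 21:55:13 +0530
Subject: [PATCH 3/4] Added HPU backend support for torch.compile tests. (#5269)

Added the HPU backend to the torch.compile tests; HPU uses hpu_backend
as its torch.compile backend.
---
 tests/unit/runtime/compile/test_compile_wrapper.py | 2 ++
 tests/unit/runtime/compile/test_compile_zero.py    | 2 ++
 tests/unit/runtime/compile/test_load_config.py     | 3 +++
 3 files changed, 7 insertions(+)

diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py
index 477b2fe2cc1b..0bebeed117b4 100644
--- a/tests/unit/runtime/compile/test_compile_wrapper.py
+++ b/tests/unit/runtime/compile/test_compile_wrapper.py
@@ -34,6 +34,8 @@ def base_config():
             "backend": "inductor"
         }
     }
+    if get_accelerator().device_name() == 'hpu':
+        config_dict['compile']['backend'] = 'hpu_backend'
     return config_dict
diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py
index b3ab91dc4b4c..79ab5efd5099 100644
--- a/tests/unit/runtime/compile/test_compile_zero.py
+++ b/tests/unit/runtime/compile/test_compile_zero.py
@@ -55,6 +55,8 @@ def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device):
         }
     }
 
+    if get_accelerator().device_name() == 'hpu':
+        config_dict['compile']['backend'] = 'hpu_backend'
     if offload_device == OffloadDeviceEnum.cpu:
         config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device}
     elif offload_device == OffloadDeviceEnum.nvme:
diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py
index 2c0511c31480..f3c53ede91fd 100644
--- a/tests/unit/runtime/compile/test_load_config.py
+++ b/tests/unit/runtime/compile/test_load_config.py
@@ -50,6 +50,9 @@ def base_config():
             "backend": "inductor"
         }
     }
+
+    if get_accelerator().device_name() == 'hpu':
+        config_dict['compile']['backend'] = 'hpu_backend'
     return config_dict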
From 75e579e70f19de3f92af28cca24b7361c8b8567c Mon Sep 17 00:00:00 2001
From: BacharL
Date: Wed, 13 Mar 2024 22:37:42 +0200
Subject: [PATCH 4/4] Average only the valid part of the IPG buffer. (#5268)

When contiguous gradients are used, the IPG buffer may not be fully
utilized. Call average_tensor only on the slice that holds valid
gradients.

Change-Id: I760559d52c2f91e15cd6cd0b48e534ec2352802a
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/zero/stage_1_and_2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index b1d94a4459d9..e8823f153fb8 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1360,7 +1360,7 @@ def reduce_ipg_grads(self):
                 self.average_tensor(extra_large_grad_reduc.view(-1))
                 self.extra_large_param_to_reduce = None
             else:
-                self.average_tensor(self.ipg_buffer[self.ipg_index])
+                self.average_tensor(self.ipg_buffer[self.ipg_index].narrow(0, 0, self.elements_in_ipg_bucket))
         else:
             self.buffered_reduce_fallback(None, self.grads_in_ipg_bucket,
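Note: the config tweak in patch 3 reduces to choosing a torch.compile backend
string per accelerator. A sketch follows, assuming a PyTorch build where the
selected backend is registered ('hpu_backend' ships with the Habana PyTorch
bridge):

    import torch

    from deepspeed.accelerator import get_accelerator

    # Pick the backend the same way the test configs above do.
    backend = "hpu_backend" if get_accelerator().device_name() == 'hpu' else "inductor"

    model = torch.nn.Linear(8, 8)                     # toy stand-in for the test model
    compiled = torch.compile(model, backend=backend)  # compile with the chosen backend
    out = compiled(torch.randn(2, 8))                 # first call triggers compilation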
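Note: patch 4's one-line fix relies on torch.Tensor.narrow returning a
zero-copy view of the populated prefix of the buffer. A toy illustration
with made-up sizes (elements_in_ipg_bucket plays the role of
self.elements_in_ipg_bucket):

    import torch

    ipg_buffer = torch.empty(16)   # pre-allocated flat buffer, like self.ipg_buffer[index]
    elements_in_ipg_bucket = 10    # only this many leading slots hold real gradients

    valid = ipg_buffer.narrow(0, 0, elements_in_ipg_bucket)  # dim 0, start 0, length 10
    assert valid.numel() == elements_in_ipg_bucket
    assert valid.data_ptr() == ipg_buffer.data_ptr()         # a view; nothing is copied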