diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index b1d94a4459d9..e8823f153fb8 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1360,7 +1360,7 @@ def reduce_ipg_grads(self):
                 self.average_tensor(extra_large_grad_reduc.view(-1))
                 self.extra_large_param_to_reduce = None
             else:
-                self.average_tensor(self.ipg_buffer[self.ipg_index])
+                self.average_tensor(self.ipg_buffer[self.ipg_index].narrow(0, 0, self.elements_in_ipg_bucket))
         else:
             self.buffered_reduce_fallback(None,
                                           self.grads_in_ipg_bucket,
diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index f3056a225a9b..4e203a71db60 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -653,8 +653,15 @@ def no_pool_bootstrap_stderr(f, xs, iters):
             setattr(lm, model_family, getattr(lm, model_family).half().to(device))
             lm._device = device
         else:
-            lm = lm_eval.models.get_model(model_family).create_from_arg_string(
-                f"pretrained={model_name}", {"device": get_accelerator().device_name()})
+            if get_accelerator().device_name() == 'hpu':
+                #lm_eval not supporting HPU device, so get model with CPU and move it to HPU.
+                lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}",
+                                                                                   {"device": "cpu"})
+                setattr(lm, model_family, getattr(lm, model_family).to(device))
+                lm._device = device
+            else:
+                lm = lm_eval.models.get_model(model_family).create_from_arg_string(
+                    f"pretrained={model_name}", {"device": get_accelerator().device_name()})
 
         get_accelerator().synchronize()
         start = time.time()
diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py
index 477b2fe2cc1b..0bebeed117b4 100644
--- a/tests/unit/runtime/compile/test_compile_wrapper.py
+++ b/tests/unit/runtime/compile/test_compile_wrapper.py
@@ -34,6 +34,8 @@ def base_config():
             "backend": "inductor"
         }
     }
+    if get_accelerator().device_name() == 'hpu':
+        config_dict['compile']['backend'] = 'hpu_backend'
     return config_dict
 
 
diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py
index b3ab91dc4b4c..79ab5efd5099 100644
--- a/tests/unit/runtime/compile/test_compile_zero.py
+++ b/tests/unit/runtime/compile/test_compile_zero.py
@@ -55,6 +55,8 @@ def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device):
             }
         }
 
+        if get_accelerator().device_name() == 'hpu':
+            config_dict['compile']['backend'] = 'hpu_backend'
         if offload_device == OffloadDeviceEnum.cpu:
             config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device}
         elif offload_device == OffloadDeviceEnum.nvme:
diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py
index 2c0511c31480..f3c53ede91fd 100644
--- a/tests/unit/runtime/compile/test_load_config.py
+++ b/tests/unit/runtime/compile/test_load_config.py
@@ -50,6 +50,9 @@ def base_config():
             "backend": "inductor"
         }
     }
+
+    if get_accelerator().device_name() == 'hpu':
+        config_dict['compile']['backend'] = 'hpu_backend'
     return config_dict
 
 
diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py
index 71b49b7723b6..32ee262f3714 100644
--- a/tests/unit/runtime/half_precision/onebit/test_onebit.py
+++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py
@@ -33,6 +33,9 @@
     pytest.skip("NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5",
                 allow_module_level=True)
 
+if get_accelerator().device_name() == 'hpu':
+    pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True)
+
 
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"])
 class TestOneBitAdamBasic(DistributedTest):
diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
index 92da2257bdb0..badd0bcee549 100644
--- a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
+++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
@@ -4,9 +4,14 @@
 # DeepSpeed Team
 
 import torch
+import pytest
 import deepspeed
 from unit.common import DistributedTest
 from unit.util import skip_on_arch
+from deepspeed.accelerator import get_accelerator
+
+if get_accelerator().device_name() == 'hpu':
+    pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True)
 
 
 class Model(torch.nn.Module):
diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py
index 0689adc08670..6338a16b8dbb 100644
--- a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py
+++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py
@@ -4,11 +4,15 @@
 # DeepSpeed Team
 
 import torch
+import pytest
 import deepspeed
 from unit.common import DistributedTest
-
+from deepspeed.accelerator import get_accelerator
 import deepspeed.utils.groups as groups
 
+if get_accelerator().device_name() == 'hpu':
+    pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True)
+
 
 class Model(torch.nn.Module):
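
Note on the stage_1_and_2.py hunk: the gradient bucket (ipg_buffer) is pre-allocated at a fixed size, but only elements_in_ipg_bucket elements are filled by the time reduce_ipg_grads runs, so the change averages a narrow() view of the filled region instead of the whole buffer. A minimal standalone sketch of that idea follows; the buffer/valid_count names are illustrative, not DeepSpeed internals.

    import torch

    # Pre-allocated bucket where only the first `valid_count` elements were
    # filled during this step (illustrative stand-ins for DeepSpeed's buffer).
    buffer = torch.zeros(8)
    buffer[:5] = torch.arange(1.0, 6.0)   # five gradient values copied in so far
    valid_count = 5

    # narrow(dim, start, length) returns a view of just the filled region, so
    # the averaging / reduction step never touches the stale tail of the buffer.
    filled = buffer.narrow(0, 0, valid_count)
    filled.div_(2)                         # stand-in for averaging across ranks

    print(filled)       # tensor([0.5000, 1.0000, 1.5000, 2.0000, 2.5000])
    print(buffer[5:])   # tail is untouched: tensor([0., 0., 0.])

Without the narrow(), the removed line fed the entire fixed-size buffer, including whatever sits past elements_in_ipg_bucket, into the averaging path.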