From 059bb2085cf404caa5874004a252a56fc74c952e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B4nio=20Vieira?= Date: Mon, 29 Apr 2024 14:37:54 -0300 Subject: [PATCH 1/2] fix: swapping order of parameters in create_dir_symlink method. (#5465) Order of parameters in create_dir_symlink method looks wrong. Because this we get the error "PermissionError: [WinError 5] Denied access: '.\\deepspeed\\ops\\csrc'" when install deepspeed >= 0.4.0 on Windows enviroment. Please check this out @eltonzheng and @jeffra. --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index f1367b850e02..839941b989c9 100755 --- a/setup.py +++ b/setup.py @@ -219,9 +219,9 @@ def create_dir_symlink(src, dest): if sys.platform == "win32": # This creates a symbolic links on Windows. # It needs Administrator privilege to create symlinks on Windows. - create_dir_symlink('..\\..\\csrc', '.\\deepspeed\\ops\\csrc') - create_dir_symlink('..\\..\\op_builder', '.\\deepspeed\\ops\\op_builder') - create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator') + create_dir_symlink('.\\deepspeed\\ops\\csrc', '..\\..\\csrc') + create_dir_symlink('.\\deepspeed\\ops\\op_builder', '..\\..\\op_builder') + create_dir_symlink('.\\deepspeed\\accelerator', '..\\accelerator') egg_info.manifest_maker.template = 'MANIFEST_win.in' # Parse the DeepSpeed version string from version.txt. From f32ad3e1c562be80bd8be6b7b2246dc3041b5bfd Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:39:12 -0700 Subject: [PATCH 2/2] Un-pin torch version in nv-torch-latest back to latest and skip test_compile_zero tests on v100 (#5459) Torch updating to 2.3.0 broke some test_compile_zero tests, we pinned it, @tohtana pushed fixes in #5463, this should un-pin and move us back to the latest. Failing test that indicates the generated code cannot run bf16 on V100 [here](https://github.com/microsoft/DeepSpeed/actions/runs/8838672379/job/24270349996?pr=5459#step:8:5157). --- .github/workflows/nv-torch-latest-v100.yml | 6 +++--- tests/unit/runtime/compile/test_compile_zero.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 2e0490c18ba7..3109f6060944 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -55,5 +55,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.2" --cuda_ver="11.8" - pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.2" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.3" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.3" --cuda_ver="11.8" diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py index 9890ea708eec..a0736b0f5425 100644 --- a/tests/unit/runtime/compile/test_compile_zero.py +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -12,7 +12,7 @@ from unit.runtime.compile.util import compare_loss from unit.common import DistributedTest -from unit.util import bf16_required_version_check +from unit.util import bf16_required_version_check, skip_on_arch pytestmark = pytest.mark.skipif(not required_torch_version(min_version=2.1), reason="Compile tests requires Pytorch version 2.1 or above") @@ -26,9 +26,11 @@ class TestZeRO(DistributedTest): @pytest.mark.parametrize('zero_stage', [1, 2, 3]) @pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme]) def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device): + if dtype == torch.bfloat16: + skip_on_arch(min_arch=8) if dtype == torch.bfloat16 and not bf16_required_version_check(): pytest.skip( - " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + "DeepSpeed BFloat16 tests need NCCL >= 2.10.3, CUDA >=11.0, and HW support for BFloat16 to run correctly" ) if get_accelerator().device_name() == "cpu": pytest.skip("CPU does not support this test yet")