Merge branch 'master' into duli/capability

microsoft · Apr 16, 2024 · 0703d79 · 0703d79
2 parents 04611e4 + 0896503
commit 0703d79
Show file tree

Hide file tree

Showing 131 changed files with 3,465 additions and 947 deletions.
diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
@@ -21,7 +21,7 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml
@@ -22,7 +22,7 @@ jobs:
     runs-on: ubuntu-20.04
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
@@ -21,7 +21,7 @@ jobs:
     runs-on: ubuntu-20.04
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: environment
         run: |

diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml
@@ -7,6 +7,24 @@ on:
   pull_request:
     paths:
       - ".github/workflows/hpu-gaudi2.yml"
+      - "accelerator/hpu_accelerator.py"
+      - "op_builder/hpu/**"
+      - "deepspeed/runtime/engine.py"
+      - "deepspeed/runtime/bf16_optimizer.py"
+      - "deepspeed/runtime/zero/stage_1_and_2.py"
+      - "deepspeed/runtime/zero/stage3.py"
+      - "deepspeed/runtime/zero/partition_parameters.py"
+      - "deepspeed/runtime/zero/partitioned_param_coordinator.py"
+      - "deepspeed/runtime/zero/parameter_offload.py"
+      - "deepspeed/runtime/pipe/engine.py"
+      - "deepspeed/runtime/utils.py"
+      - "deepspeed/inference/engine.py"
+      - "deepspeed/module_inject/auto_tp.py"
+      - "deepspeed/module_inject/replace_module.py"
+      - "deepspeed/module_inject/load_checkpoint.py"
+      - "deepspeed/module_inject/inject.py"
+      - "deepspeed/ops/transformer/**"
+      - "deepspeed/ops/adam/**"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -21,15 +39,67 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+      image: vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
 
+    env:
+      PT_HPU_LAZY_MODE: 0
+      TEST_LIST: |
+        test_accelerator.py
+        test_autotuning.py
+        test_compression.py
+        test_dist.py
+        test_elastic.py
+        (test_intX_quantization.py and test_quantized_linear)
+        test_ds_arguments.py
+        test_run.py
+        test_multinode_runner.py
+        test_moe_tp.py
+        test_monitor.py
+        (test_zero_optimizer.py and (TestSaveTensorClone or TestZeRONonDistributed))
+        (test_latest_checkpoint.py and test_missing_latest)
+        test_reshape_checkpoint.py
+        test_shared_weights.py
+        test_sparse.py
+        test_tag_validation.py
+        test_pipe_module.py
+        (test_flops_profiler.py and test_flops_profiler_in_inference)
+        test_get_optim_files.py
+        test_groups.py
+        test_init_on_device.py
+        test_partition_balanced.py
+        (test_adamw.py and TestAdamConfigs)
+        test_coalesced_collectives.py
+        test_activation_checkpointing_non_reentrant.py
+        test_activation_checkpointing.py
+        test_data.py
+        (test_ds_config_dict.py and (TestBasicConfig or TestBatchConfig))
+        test_ds_config_model.py
+        test_mup_optimizers.py
+        (test_pld.py and test_pld_schedule)
+        test_runtime_utils.py
+        test_pipe_schedule.py
+        test_topology.py
+        (test_ds_initialize.py and (TestClientOptimizer or TestClientLrScheduler))
+        test_csr.py
+        (test_fp16.py and (TestZeroEmptyGrad or TestZeroAllowUntestedOptimizer))
+        (test_bf16.py and TestZeroDtypeCocktail)
+        test_partition.py
+        test_ignore_unused_parameters.py
+        test_zero_config.py
+        test_zero_context_ancestry.py
+        (test_zero_context.py and not TestSerialContext)
+        test_zero_dynamic_class.py
+        test_zero_nesting_init.py
+        test_zeropp.py
+        (test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
+
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Check container state
         run: |
@@ -38,11 +108,28 @@ jobs:
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
+      - name: Install transformers
+        run: |
+          git clone https://github.com/huggingface/transformers
+          cd transformers
+          git rev-parse --short HEAD
+          pip install .
+
       - name: Install deepspeed
         run: |
-          pip install .[dev]
+          pip install .[dev,autotuning]
           ds_report
 
       - name: Python environment
         run: |
           pip list
+
+      - name: Unit tests
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          cd tests
+          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
+          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
+          echo "TEST_LIST ${TEST_LIST}"
+          echo "PT_HPU_LAZY_MODE ${PT_HPU_LAZY_MODE}"
+          pytest --verbose unit/ -k "${TEST_LIST}"
diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
@@ -29,7 +29,7 @@ jobs:
       options: --gpus all --shm-size "8G"
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Check container state
         run: |

diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
@@ -19,10 +19,10 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
@@ -21,10 +21,10 @@ permissions:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml
@@ -23,7 +23,7 @@ jobs:
       options: --gpus all --shm-size "8G"
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Check container state
         run: |

diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
@@ -22,10 +22,10 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
@@ -27,10 +27,10 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
@@ -15,17 +15,17 @@ permissions:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu116
+          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu117
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -55,7 +55,7 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+          pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.7"
 
       - name: Open GitHub issue if nightly CI fails
         if: ${{ failure() && (github.event_name == 'schedule') }}

diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
@@ -26,7 +26,7 @@ jobs:
       image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
 
     steps:
-        - uses: actions/checkout@v3
+        - uses: actions/checkout@v4
 
         - name: environment
           run: |
@@ -36,7 +36,7 @@ jobs:
             #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
         - name: Compile DeepSpeed Ops
           run: |
-            DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0  DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+            DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
         - name: DS Report
           run: |
              ds_report
diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml
@@ -33,7 +33,7 @@ jobs:
       options: --gpus all --shm-size "8G"
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Check container state
         run: |

diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
@@ -19,10 +19,10 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml
@@ -15,10 +15,10 @@ permissions:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml
@@ -18,10 +18,10 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu116, v100]
+    runs-on: [self-hosted, nvidia, cu117, v100]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -11,7 +11,7 @@ jobs:
     environment: release-env
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       with:
         ref: "master"
     - id: setup-venv