diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index e4b938d8e078..00ff72ac8929 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -21,7 +21,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 3e09d3cc1e49..38dd9bd3efef 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -47,42 +47,26 @@ jobs: - name: Detect instruction sets on instance run: | lscpu - pip install cmake - git clone https://github.com/intel/intel-extension-for-pytorch - cd intel-extension-for-pytorch/tests/cpu/isa - cmake . - make - ./cpu_features - name: Install numactl run: | sudo apt-get install -y numactl - - name: Install oneCCL Bindings for PyTorch + - name: Install dependencies run: | pip install torch - python -m pip install intel_extension_for_pytorch - # the curl line is for troubleshooting - curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ - python -m pip install oneccl_bind_pt --index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ - pip install py-cpuinfo # check installed version pip list |grep \\\ - pip list |grep intel-extension-for-pytorch - pip list |grep oneccl-bind-pt - name: Install oneCCL run: | + pip install cmake git clone https://github.com/oneapi-src/oneCCL cd oneCCL mkdir build cd build cmake .. - make - make install - #source ./_install/env/setvars.sh - # test whether oneCCL is correctly installed - #mpirun -n 2 ./examples/benchmark/benchmark + make -j install - name: Install transformers run: | @@ -103,7 +87,6 @@ jobs: source oneCCL/build/_install/env/setvars.sh export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 # check whether the environment is properly setup - python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')" python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" - name: Unit tests diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index ba4906db15c9..9c1ad02f75a6 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -22,11 +22,15 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv + - name: Install system packages + run: | + sudo apt-get install -y numactl pdsh + - name: Install pytorch run: | pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index 88dfa34a0a2b..d2554b7c0038 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml new file mode 100644 index 000000000000..a3c12d057cc9 --- /dev/null +++ b/.github/workflows/hpu-gaudi2.yml @@ -0,0 +1,135 @@ +name: hpu-gaudi2 + +on: + workflow_dispatch: + 
schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/hpu-gaudi2.yml" + - "accelerator/hpu_accelerator.py" + - "op_builder/hpu/**" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "deepspeed/inference/engine.py" + - "deepspeed/module_inject/auto_tp.py" + - "deepspeed/module_inject/replace_module.py" + - "deepspeed/module_inject/load_checkpoint.py" + - "deepspeed/module_inject/inject.py" + - "deepspeed/ops/transformer/**" + - "deepspeed/ops/adam/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + unit-tests: + # The type of runner that the job will run on + runs-on: [self-hosted, intel, gaudi2] + container: + image: vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + ports: + - 80 + options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice + + env: + PT_HPU_LAZY_MODE: 0 + TEST_LIST: | + test_accelerator.py + test_autotuning.py + test_compression.py + test_dist.py + test_elastic.py + (test_intX_quantization.py and test_quantized_linear) + test_ds_arguments.py + test_run.py + test_multinode_runner.py + test_moe_tp.py + test_monitor.py + (test_zero_optimizer.py and (TestSaveTensorClone or TestZeRONonDistributed)) + (test_latest_checkpoint.py and test_missing_latest) + test_reshape_checkpoint.py + test_shared_weights.py + test_sparse.py + test_tag_validation.py + test_pipe_module.py + (test_flops_profiler.py and test_flops_profiler_in_inference) + test_get_optim_files.py + test_groups.py + test_init_on_device.py + test_partition_balanced.py + (test_adamw.py and TestAdamConfigs) + test_coalesced_collectives.py + test_activation_checkpointing_non_reentrant.py + test_activation_checkpointing.py + test_data.py + (test_ds_config_dict.py and (TestBasicConfig or TestBatchConfig)) + test_ds_config_model.py + test_mup_optimizers.py + (test_pld.py and test_pld_schedule) + test_runtime_utils.py + test_pipe_schedule.py + test_topology.py + (test_ds_initialize.py and (TestClientOptimizer or TestClientLrScheduler)) + test_csr.py + (test_fp16.py and (TestZeroEmptyGrad or TestZeroAllowUntestedOptimizer)) + (test_bf16.py and TestZeroDtypeCocktail) + test_partition.py + test_ignore_unused_parameters.py + test_zero_config.py + test_zero_context_ancestry.py + (test_zero_context.py and not TestSerialContext) + test_zero_dynamic_class.py + test_zero_nesting_init.py + test_zeropp.py + (test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam)) + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + + - name: Check container state + run: | + ldd --version + hl-smi + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + - name: Install transformers + run: | + git clone https://github.com/huggingface/transformers + cd transformers + git rev-parse --short HEAD + 
pip install . + + - name: Install deepspeed + run: | + pip install .[dev,autotuning] + ds_report + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + cd tests + export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE} + TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}') + echo "TEST_LIST ${TEST_LIST}" + echo "PT_HPU_LAZY_MODE ${PT_HPU_LAZY_MODE}" + pytest --verbose unit/ -k "${TEST_LIST}" diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 960e0203919e..3ce406948432 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -29,7 +29,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 93286b62610a..915493bb3183 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -19,10 +19,10 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index 61011a85b92c..94571eb101bb 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -21,10 +21,10 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml index 93f074787372..5574ce8aa634 100644 --- a/.github/workflows/nv-h100.yml +++ b/.github/workflows/nv-h100.yml @@ -23,7 +23,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index a24376c8973d..f863226bfb95 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -22,10 +22,10 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv @@ -46,7 +46,8 @@ jobs: - name: Install deepspeed run: | - pip install .[dev,1bit,autotuning,inf,triton] + DS_ACCELERATOR=cpu pip install .[dev,1bit,autotuning,inf] + #pip install .[dev,1bit,autotuning,inf,triton] ds_report - name: Python environment @@ -60,3 +61,5 @@ jobs: #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="11.8" pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="11.8" pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="11.8" + # run ds_report again to check updated op list + ds_report diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index 0b3f128be5a4..8452c138c717 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -27,10 +27,10 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: 
[self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index e540b5acaf33..b1e8c042214f 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -15,17 +15,17 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu117 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -55,7 +55,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6" + pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.7" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index 6440de1a81ba..6afc11fddaab 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -26,7 +26,7 @@ jobs: image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | @@ -36,7 +36,7 @@ jobs: #python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Compile DeepSpeed Ops run: | - DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install . + DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install . 
- name: DS Report run: | ds_report diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml index 0af9517c5b59..b348d5ff931f 100644 --- a/.github/workflows/nv-sd.yml +++ b/.github/workflows/nv-sd.yml @@ -33,7 +33,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index e2d0f172dcbf..3ca8ac43dfa4 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -19,10 +19,10 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index f46c5089b241..257040439005 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -15,10 +15,10 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 4fbc42abec5f..cfed6d6583e6 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -18,10 +18,10 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu117, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8e016b4169cb..5a931125eff6 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -11,7 +11,7 @@ jobs: environment: release-env steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: "master" - id: setup-venv diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml new file mode 100644 index 000000000000..f83c58dc7ff5 --- /dev/null +++ b/.github/workflows/xpu-max1100.yml @@ -0,0 +1,74 @@ +name: xpu-max1100 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu-max1100.yml" + - "accelerator/xpu_accelerator.py" + - "accelerator/abstract_accelerator.py" + - "accelerator/cpu_accelerator.py" + - "accelerator/real_accelerator.py" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + + +jobs: + unit-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/intel-extension-for-pytorch:2.1.20-xpu + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + 
steps: + - uses: actions/checkout@v4 + - name: Check container state + shell: bash + run: | + ldd --version + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + + - name: Install deepspeed + run: | + pip install py-cpuinfo + pip install .[dev,autotuning] + ds_report + python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + pip install pytest pytest-timeout tabulate + cd tests/unit + pytest --verbose accelerator/* + pytest --verbose autotuning/* + pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose runtime/test_ds_config_model.py + pytest --verbose runtime/pipe/test_pipe_schedule.py + pytest --verbose runtime/zero/test_zero_config.py + pytest --verbose runtime/zero/test_zero_tiled.py + pytest --verbose runtime/zero/test_zeropp.py diff --git a/README.md b/README.md index c8b30eb104c6..a1335caa4949 100755 --- a/README.md +++ b/README.md @@ -131,7 +131,9 @@ DeepSpeed has been integrated with several different popular open-source DL fram | ----------- | ------ | | NVIDIA | [![nv-torch110-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) | | AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | -| CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) | +| CPU | [![torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-torch-latest.yml) [![cpu-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/cpu-inference.yml) | +| Intel 
Gaudi | [![hpu-gaudi2](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/hpu-gaudi2.yml) | +| Intel XPU | [![xpu-max1100](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/xpu-max1100.yml) | | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | @@ -159,8 +161,9 @@ dynamically link them at runtime. | Contributor | Hardware | Accelerator Name | Contributor validated | Upstream validated | | ----------- | -------- | ---------------- | --------------------- | ------------------ | +| Intel | Intel(R) Gaudi(R) 2 AI accelerator | hpu | Yes | Yes | | Intel | Intel(R) Xeon(R) Processors | cpu | Yes | Yes | -| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | No | +| Intel | Intel(R) Data Center GPU Max series | xpu | Yes | Yes | ## PyPI We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. 
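The CI workflows and the accelerator support table above all revolve around DeepSpeed's runtime accelerator abstraction, which the accelerator/ changes below modify. As orientation, here is a minimal Python sketch that probes which accelerator was resolved, using only calls that already appear in the workflow steps (get_accelerator()._name in xpu-max1100.yml, device_name() and is_available() in cpu-inference.yml); it is an illustrative probe, not part of the patch:

# Minimal probe of the resolved DeepSpeed accelerator (mirrors the CI checks above).
# Assumes deepspeed is installed; with DS_ACCELERATOR unset, detection is automatic.
from deepspeed.accelerator import get_accelerator

acc = get_accelerator()
print("accelerator:", acc._name)           # e.g. "cuda", "cpu", "xpu", "hpu"
print("device name:", acc.device_name())   # same check as in cpu-inference.yml
print("available:", acc.is_available())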
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index b1aba75b4c5e..870d3e91816e 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -4,9 +4,14 @@ # DeepSpeed Team import torch -from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator -import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore -import psutil +from .abstract_accelerator import DeepSpeedAccelerator + +try: + import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore + oneccl_imported_p = True +except ImportError as e: + oneccl_imported_p = False + import os @@ -15,8 +20,17 @@ class CPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'cpu' - self._communication_backend_name = 'ccl' - self.max_mem = psutil.Process().memory_info().rss + if oneccl_imported_p: + self._communication_backend_name = 'ccl' + else: + # fallback to gloo if oneccl_binding_for_pytorch is not installed + self._communication_backend_name = 'gloo' + try: + import psutil + mem = psutil.Process().memory_info().rss + self.max_mem = mem + except ImportError as e: + self.max_mem = 0 def is_synchronized_device(self): return True @@ -115,12 +129,14 @@ def empty_cache(self): return def get_rss(self): + import psutil mem = psutil.Process().memory_info().rss if mem > self.max_mem: self.max_mem = mem return mem def reset_rss(self): + import psutil mem = psutil.Process().memory_info().rss self.max_mem = mem return mem @@ -166,9 +182,11 @@ def max_memory_reserved(self, device_index=None): return self.max_mem def total_memory(self, device_index=None): + import psutil return psutil.virtual_memory().total def available_memory(self, device_index=None): + import psutil return psutil.virtual_memory().available # Misc diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 1090a61681d9..037162e867ec 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -73,11 +73,7 @@ def get_accelerator(): f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system." ) elif accelerator_name == "cpu": - try: - import intel_extension_for_pytorch # noqa: F401 # type: ignore - except ImportError as e: - raise ValueError( - f"CPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.") + pass elif accelerator_name == "npu": try: import torch_npu # noqa: F401 # type: ignore @@ -154,7 +150,23 @@ def get_accelerator(): except ImportError as e: pass if accelerator_name is None: - accelerator_name = "cuda" + # borrow this log from PR#5084 + try: + import torch + + # Determine if we are on a GPU or x86 CPU with torch. + if torch.cuda.is_available(): #ignore-cuda + accelerator_name = "cuda" + else: + if accel_logger is not None: + accel_logger.warn( + "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it." 
+ ) + accelerator_name = "cpu" + except (RuntimeError, ImportError) as e: + # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection + accelerator_name = "cuda" + pass ds_set_method = "auto detect" diff --git a/blogs/deepspeed-fp6/03-05-2024/README.md b/blogs/deepspeed-fp6/03-05-2024/README.md index dbd6b2d081aa..0285dd79b87d 100755 --- a/blogs/deepspeed-fp6/03-05-2024/README.md +++ b/blogs/deepspeed-fp6/03-05-2024/README.md @@ -43,7 +43,7 @@ To cite DeepSpeed-FP6, please cite the following two arxiv reports - ZeroQuant(4 In the evolving landscape of Large Language Models (LLMs) like GPT, our research aims to boost computational efficiency and storage while preserving model quality. This focus brings us to tackle the complex challenges of 4-bit quantization, where optimizing performance, efficiency, and accuracy is crucial. -**Exploring the Challenges of 4-bit Quantization** In our recent research findings -- ZeroQuant (4+2)[1], we explore the capabilities of INT4 quantization techniques (like the GPTQ algorithm) for serving Large Language Models (LLMs). While these techniques reduce memory and computational requirements, they often perform poorly on a broad array of tasks, including generative tasks such as code generation and summarization, due to overfitting issues. This highlights the urgent need for new quantization approaches that simultanenously improve both the efficiency and effectiveness of LLMs. +**Exploring the Challenges of 4-bit Quantization** In our recent research findings -- ZeroQuant (4+2)[1], we explore the capabilities of INT4 quantization techniques (like the GPTQ algorithm) for serving Large Language Models (LLMs). While these techniques reduce memory and computational requirements, they often perform poorly on a broad array of tasks, including generative tasks such as code generation and summarization, due to overfitting issues. This highlights the urgent need for new quantization approaches that simultaneously improve both the efficiency and effectiveness of LLMs. **Breakthroughs with FP6 Precision** Our exploration of different quantization methods led us to the FP6 precision standard. Despite the challenges in integrating and accelerating FP6 with current AI hardware -- which we will address in the next section - this format excels in performance and flexibility across various tasks. Notably, we observe that for generative tasks, FP6 quantization can match the performance of the half-precision (FP16) format. For example, with FP6 quantization, StarCoder-15B achieves comparable code generation results to the FP16 variant, while a smaller model, such as BART-460M, achieves comparable summarization performance to the standard FP16 equivalent. In order to preserve these quality gains, while matching the system efficiency of INT4 quantization on AI hardware, we propose a novel 4+2 FP6 scheme. This innovation makes FP6 a promising direction for improving the efficiency of LLMs, marking a significant leap in AI technology advancement. For more details, please refer to our research paper - ZeroQuant (4+2)[1]. 
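The real_accelerator.py hunk above auto-detects the accelerator: it selects "cuda" when torch.cuda.is_available() and otherwise logs a warning and falls back to "cpu". When detection should be bypassed (the nv-inference and nv-pre-compile-ops workflows do this by exporting DS_ACCELERATOR at install time), the accelerator can be pinned before DeepSpeed is used. A small sketch, assuming DS_ACCELERATOR is honored on the first get_accelerator() call as its use in the workflows above suggests:

# Pin the accelerator explicitly instead of relying on auto-detection.
# Assumption: DS_ACCELERATOR is read at the first get_accelerator() call,
# matching its use in nv-inference.yml and nv-pre-compile-ops.yml above.
import os
os.environ["DS_ACCELERATOR"] = "cpu"   # or "cuda", "xpu", "hpu"

from deepspeed.accelerator import get_accelerator
print(get_accelerator().device_name())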
diff --git a/blogs/deepspeed-ulysses/README.md b/blogs/deepspeed-ulysses/README.md index aa4416521dd1..375eb1190325 100644 --- a/blogs/deepspeed-ulysses/README.md +++ b/blogs/deepspeed-ulysses/README.md @@ -233,7 +233,7 @@ at different sequence length and GPU count.* Next, we evaluate Ulysses on 7 billion (7B) and 30 billion (30B) parameter GPT dense attention models and compare against Megatron-LM's sequence -parallelism (Megatron LM) and Colosal AI sequence parallelism (ColAI-SP) on +parallelism (Megatron LM) and Colossal AI sequence parallelism (ColAI-SP) on 32 and 64 A100 GPUs respectively. The results of these evaluations are shown in Figures 3 and 4. diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 6428ab5cbfa5..786906717f23 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -5,281 +5,24 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include "shm.h" -// states for collectives -enum coll_state { - coll_begin = 0, - // coll states for naive allreduce - coll_allreduce_naive__copy_in_done, // this state is for rank != 0 - coll_allreduce_naive__reduce_done, // this state is for rank == 0 - coll_allreduce_naive__copy_out_done, // this state is for rank != 0 -}; - -// SHM building blocks -struct SharedData { - const char* name; - int descriptor; - void* bytes; - size_t nbytes; -}; - -void shared_open(SharedData* data, const char* name, size_t nbytes) -{ - int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); - if (d != -1) { - void* bytes = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0); - data->name = name; - data->descriptor = d; - data->bytes = bytes; - data->nbytes = nbytes; - } else { - printf("shared_open %s failed\n", name); - data->descriptor = -1; - } -} - -void shared_create(SharedData* data, const char* name, void* bytes, size_t nbytes) -{ - int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); - if (d != -1) { - if (nbytes = write(d, bytes, nbytes)) { shared_open(data, name, nbytes); } - } else { - printf("shared_create %s failed\n", name); - } -} - -void shared_close(SharedData* data) -{ - if (data->descriptor != -1) { - munmap(data->bytes, data->nbytes); - shm_unlink(data->name); - } -} - -// SHM based allreduce helper functions -// buffer that holds shm name -#define NAME_BUF_SIZE 1000 -#define MAX_BUF_SIZE 1048576 -#define SHM_BUFFER_NAME "deepspeed_allreduce_buffer" -SharedData allreduce_buffer; -struct allreduce_workspace { - enum coll_state state; - char buffer[MAX_BUF_SIZE]; -}; -struct allreduce_workspace* workspace; - -void wait_buffer_state_until(int index, enum coll_state state) -{ - volatile enum coll_state* state_ptr = &(workspace[index].state); - - while (*state_ptr != state) - ; -} - -void wait_buffer_state_until_not(int index, enum coll_state state) -{ - volatile enum coll_state* state_ptr = &(workspace[index].state); - - while (*state_ptr == state) - ; -} - -__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); -inline __m512 cvt_bf16_to_fp32(const __m256i src) -{ - auto y = _mm512_cvtepu16_epi32(src); - return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2)); -} - -inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); -inline __m256i cvt_fp32_to_bf16(const __m512 src) -{ - __m512i value = _mm512_castps_si512(src); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = 
_mm512_set1_epi32(0x7fff); - // uint32_t lsb = (input >> 16) & 1; - auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); - // uint32_t rounding_bias = 0x7fff + lsb; - t_value = _mm512_add_epi32(t_value, vec_bias); - // input += rounding_bias; - t_value = _mm512_add_epi32(t_value, value); - // input = input >> 16; - t_value = _mm512_srli_epi32(t_value, 16); - // Check NaN before converting back to bf16 - t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); - return _mm512_cvtusepi32_epi16(t_value); -} - -void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in) - __attribute__((target("avx512bw"))); - -void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) - __attribute__((target("avx512bw"))); - -void reduce_2_fp32_buffers(int num_elements, void* in_out, void* in) - __attribute__((target("avx512bw"))); - -void reduce_fp32_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) - __attribute__((target("avx512bw"))); - -// N_REDUCE_LIMIT is the number of buffers that can be reduced together in one shot. -// Compared with do N-1 2-reduces which needs 2*(N-1) read and N-1 write, -// N-reduce only needs N read and 1 write, this saves 2/3 memory bandwidth. -// When increase N_REDUCE_LIMIT to a bigger number, do the following steps -// 1. Extend REPEAT_ macros list down below -// 2. Extend switch cases which call "REPEAT(X, ...)" down below -#define N_REDUCE_LIMIT 8 - -void reduce_all_buffers(struct allreduce_workspace* workspace, - int num_elements, - c10::ScalarType scalar_type, - int num_buffers) -{ - switch (scalar_type) { - case c10::ScalarType::BFloat16: - if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) { - reduce_bf16_buffers(num_elements, num_buffers, workspace); - } else { - for (int i = 1; i < num_buffers; i++) { - reduce_2_bf16_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); - } - } - break; - case c10::ScalarType::Float: - if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) { - reduce_fp32_buffers(num_elements, num_buffers, workspace); - } else { - for (int i = 1; i < num_buffers; i++) { - reduce_2_fp32_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); - } - } - break; - default: assert(!"Should not get here"); - } -} +// #define DO_PROFILE +#ifdef DO_PROFILE +#include +#include +#endif -#define REPEAT(N, x) REPEAT_##N(x) -#define REPEAT_1(x) x(1) -#define REPEAT_2(x) \ - REPEAT_1(x); \ - x(2) -#define REPEAT_3(x) \ - REPEAT_2(x); \ - x(3) -#define REPEAT_4(x) \ - REPEAT_3(x); \ - x(4) -#define REPEAT_5(x) \ - REPEAT_4(x); \ - x(5) -#define REPEAT_6(x) \ - REPEAT_5(x); \ - x(6) -#define REPEAT_7(x) \ - REPEAT_6(x); \ - x(7) - -#define CVT_ADD_BF16(x) \ - do { \ - auto in##x##_val = \ - cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x].buffer + i))); \ - inout_val = _mm512_add_ps(inout_val, in##x##_val); \ - } while (0) - -// Reduce functions down below use vectorized algorithm, the number of bytes processed each -// iteration depends on vector length. 
256bit vector ==> 32 bytes, 512bit vector ==> 64 bytes -// If you change implementation of reduce_2_bf16_buffers or reduce_2_fp32_buffers, check -// whether this number needs to be changed -#define VECTOR_LENGTH_IN_BYTES 32 - -// num_elements must be divisible by 16 (caller check) -void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 2; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[0].buffer + i))); - switch (num_buffers) { - case 8: REPEAT(7, CVT_ADD_BF16); break; - case 7: REPEAT(6, CVT_ADD_BF16); break; - case 6: REPEAT(5, CVT_ADD_BF16); break; - case 5: REPEAT(4, CVT_ADD_BF16); break; - case 4: REPEAT(3, CVT_ADD_BF16); break; - case 3: REPEAT(2, CVT_ADD_BF16); break; - default: assert(!"Should not get here."); - } - _mm256_storeu_si256((__m256i*)(workspace[0].buffer + i), cvt_fp32_to_bf16(inout_val)); - } -} - -void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 2; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - _mm256_storeu_si256((__m256i*)((char*)in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -#define CVT_ADD_F32(x) \ - do { \ - auto in##x##_val = _mm256_loadu_ps((float*)(workspace[x].buffer + i)); \ - inout_val = _mm256_add_ps(inout_val, in##x##_val); \ - } while (0) - -// num_elements must be divisible by 16 (caller check) -void reduce_fp32_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 4; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = _mm256_loadu_ps((float*)(workspace[0].buffer + i)); - switch (num_buffers) { - case 8: REPEAT(7, CVT_ADD_F32); break; - case 7: REPEAT(6, CVT_ADD_F32); break; - case 6: REPEAT(5, CVT_ADD_F32); break; - case 5: REPEAT(4, CVT_ADD_F32); break; - case 4: REPEAT(3, CVT_ADD_F32); break; - case 3: REPEAT(2, CVT_ADD_F32); break; - default: assert(!"Should not get here."); - } - _mm256_storeu_ps((float*)(workspace[0].buffer + i), inout_val); - } -} +// Communication settings +static int world_rank = -1; +static int world_size = -1; -void reduce_2_fp32_buffers(int num_elements, void* in_out, void* in1) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 4; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = _mm256_loadu_ps((float*)((char*)in_out + i)); - auto in1_val = _mm256_loadu_ps((float*)((char*)in1 + i)); - inout_val = _mm256_add_ps(inout_val, in1_val); - _mm256_storeu_ps((float*)((char*)in_out + i), inout_val); - } -} - -// Communicatiooon settings -int world_rank = -1; -int world_size = -1; - -std::set _comm_ids; -std::set _colors; -std::vector _ccl_comms; -ccl::shared_ptr_class sub_kvs; -std::map, int> group_to_comm_id; +static std::set _comm_ids; +static std::set _colors; +static std::vector _ccl_comms; +static ccl::shared_ptr_class sub_kvs; +static std::map, int> group_to_comm_id; ccl::communicator& _get_comm_from_group() { return _ccl_comms[0]; } ccl::communicator& _get_comm_from_group(py::object group) { return _ccl_comms[0]; } @@ -300,11 +43,11 @@ ccl::communicator& _get_comm_from_group(std::vector ranks) #define KVS_CREATE_SUCCESS 0 #define KVS_CREATE_FAILURE -1 -bool 
is_initialized = 0; +static bool is_initialized = 0; -ccl::shared_ptr_class kvs; +static ccl::shared_ptr_class kvs; -bool all_ranks_local_p = false; +static bool all_ranks_local_p = false; void initialize(int size, int rank, torch::Tensor& kvs_data) { @@ -336,30 +79,8 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) if (addr_string == NULL) { addr_string = ""; } auto port_string = std::getenv("MASTER_PORT"); if (port_string == NULL) { port_string = ""; } - char shm_name[NAME_BUF_SIZE]; - snprintf(shm_name, - NAME_BUF_SIZE, - "%s_%d_%s_%s", - SHM_BUFFER_NAME, - getuid(), - addr_string, - port_string); - // create shared workspace for SHM based allreduce - if (all_ranks_local_p) { - if (rank == 0) { - workspace = - (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); - shared_create( - &allreduce_buffer, shm_name, workspace, size * sizeof(struct allreduce_workspace)); - workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; - for (int i = 0; i < size; i++) { workspace[i].state = coll_begin; } - } - CCLCHECK(ccl::barrier(_get_comm_from_group()).wait()); - if (rank != 0) { - shared_open(&allreduce_buffer, shm_name, size * sizeof(struct allreduce_workspace)); - } - workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; - } + + if (all_ranks_local_p) { shm_initialize(size, rank, addr_string, port_string); } } /* @@ -526,19 +247,22 @@ void all_reduce_caching(torch::Tensor& data, .wait()); } -static void parallel_memcpy(void* to, void* from, size_t n_bytes) - __attribute__((target("avx512bw"))); -static void parallel_memcpy(void* to, void* from, size_t n_bytes) -{ -#pragma omp parallel for - for (int i = 0; i < n_bytes; i += VECTOR_LENGTH_IN_BYTES) { - auto val = _mm256_loadu_si256((__m256i*)((char*)from + i)); - _mm256_storeu_si256((__m256i*)((char*)to + i), val); - } -} - void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) { +#ifdef DO_PROFILE + static double total_time = 0.0; + static double total_time_sq = 0.0; + static int count = -16; // warmup + static double max_time = 0.0; + static double min_time = DBL_MAX; + // make sure all rank reach this point before measuring time + // turn on this if you suspect each rank didn't reach here at the same time (stragger) + // if (all_ranks_local_p) { + // barrier_wait(0, world_size); + //} + auto start = std::chrono::system_clock::now(); +#endif + static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); @@ -555,7 +279,7 @@ void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) default: data_type_fallback = true; } - if (data_type_fallback || (data_size % VECTOR_LENGTH_IN_BYTES) != 0 || !all_ranks_local_p) { + if (data_type_fallback || !all_ranks_local_p) { // fallback to oneccl allreduce CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), @@ -564,50 +288,31 @@ void inference_all_reduce(torch::Tensor& data, py::object op, bool async_op) get_ccl_reduce_op(op, data), _get_comm_from_group()) .wait()); - return; + } else { + all_reduce_outer_loop(data, numel, data_size); } - for (int offset = 0; offset < data_size; offset += MAX_BUF_SIZE) { - auto data_ptr = ((char*)(data.data_ptr()) + offset); - size_t chunk_size = data_size - offset > MAX_BUF_SIZE ? 
MAX_BUF_SIZE : data_size - offset; - size_t chunk_el = chunk_size / (data_size / numel); - - parallel_memcpy(workspace[world_rank].buffer, data_ptr, chunk_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__copy_in_done; - - if (world_rank == 0) { - // compute allreduce result on rank 0 - for (int i = 1; i < world_size; i++) { - // wait until the other rank copy the buffer - wait_buffer_state_until(i, coll_allreduce_naive__copy_in_done); - } - reduce_all_buffers(workspace, chunk_el, data.scalar_type(), world_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__reduce_done; - parallel_memcpy(data_ptr, workspace[0].buffer, chunk_size); - } - if (world_rank != 0) { - wait_buffer_state_until(0, coll_allreduce_naive__reduce_done); - parallel_memcpy(data_ptr, workspace[0].buffer, chunk_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__copy_out_done; - } - if (world_rank == 0) { - for (int i = 1; i < world_size; i++) { - wait_buffer_state_until(i, coll_allreduce_naive__copy_out_done); - } - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_begin; - } - if (world_rank != 0) { - // if rank 0 spin too fast it could be in state 1 of next allreduce - // in this case wait_buffer_state_until(0, 0) may cause deadlock - // what we are certain is when rank 0 finishes the state won't be 2 - wait_buffer_state_until_not(0, coll_allreduce_naive__reduce_done); - workspace[world_rank].state = coll_begin; +#ifdef DO_PROFILE + auto end = std::chrono::system_clock::now(); + count++; + if (count > 0) { + double elapsed = std::chrono::duration_cast(end - start).count(); + if (elapsed > max_time) { max_time = elapsed; } + if (elapsed < min_time) { min_time = elapsed; } + total_time += elapsed; + total_time_sq += elapsed * elapsed; + if (world_rank == 0 && count == 1000) { + auto avg = total_time / count; + auto sd = + sqrt(total_time_sq / count - total_time * total_time / (count * count)) / avg * 100; + printf(" C++ kernel\t\t %.2f\t %.2f\t%.2f\t %.2f\n", + min_time, + max_time, + total_time / count, + sd); } } +#endif } void barrier(std::vector group, bool async_op) diff --git a/csrc/cpu/comm/shm.cpp b/csrc/cpu/comm/shm.cpp new file mode 100644 index 000000000000..859c2fec292d --- /dev/null +++ b/csrc/cpu/comm/shm.cpp @@ -0,0 +1,686 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +#include +#include +#include +#include +#include +#include "shm.h" + +// #define DO_PROFILE +#ifdef DO_PROFILE +#include +#include +#endif + +// states for collectives +enum coll_state { + coll_begin = 0, + coll_allreduce_naive__copy_in_done, // this state is for rank != 0 + coll_allreduce_naive__reduce_done, // this state is for rank == 0 + coll_allreduce_naive__copy_out_done, // this state is for rank != 0 +}; + +// SHM building blocks +struct SharedData { + const char* name; + int descriptor; + void* bytes; + size_t nbytes; +}; + +void shared_open(SharedData* data, const char* name, size_t nbytes) +{ + int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); + if (d != -1) { + void* bytes = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0); + data->name = name; + data->descriptor = d; + data->bytes = bytes; + data->nbytes = nbytes; + } else { + if (errno != ENOENT) { + // don't print if shm can not be found because we want to loop over from + // caller again until the other ranks created the shm + printf("shared_open %s failed, errno=%d\n", name, errno); + } + data->descriptor = -1; + } +} + +void shared_create(SharedData* data, const char* name, void* bytes, size_t nbytes) +{ + int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (d != -1) { + if (nbytes = write(d, bytes, nbytes)) { shared_open(data, name, nbytes); } + } else { + printf("shared_create %s failed\n", name); + } +} + +void shared_close(SharedData* data) +{ + if (data->descriptor != -1) { + munmap(data->bytes, data->nbytes); + shm_unlink(data->name); + } +} + +// SHM based allreduce helper functions +// buffer that holds shm name +#define NAME_BUF_SIZE 1000 +#define MAX_BUF_SIZE 1048576 * 32 +#define NAIVE_ALLREDUCE_THRESHOLD 1048576 +#define SHM_BUFFER_NAME "deepspeed_allreduce_buffer" +struct allreduce_workspace { + enum coll_state state; + sem_t mutex; + sem_t turnstile1; + sem_t turnstile2; + int counter; + char buffer[MAX_BUF_SIZE]; +}; +struct allreduce_workspace** workspace; + +void wait_buffer_state_until(int index, enum coll_state state) +{ + volatile enum coll_state* state_ptr = &(workspace[index]->state); + + while (*state_ptr != state) + ; +} + +void wait_buffer_state_until_range(int index, enum coll_state start, int size) +{ + volatile enum coll_state* state_ptr = &(workspace[index]->state); + enum coll_state end = (enum coll_state)(start + size); + + while (1) { + volatile enum coll_state cur_state = *state_ptr; + if (cur_state >= start and cur_state < end) break; + } +} + +void wait_buffer_state_until_not(int index, enum coll_state state) +{ + volatile enum coll_state* state_ptr = &(workspace[index]->state); + + while (*state_ptr == state) + ; +} + +void barrier_wait(int root_idx, int num_ranks) +{ + // Phase 1: Wait for all threads to enter the barrier + auto shared = workspace[root_idx]; + sem_wait(&shared->mutex); + shared->counter++; + if (shared->counter == num_ranks) { + for (int i = 0; i < num_ranks; ++i) { sem_post(&shared->turnstile1); } + } + sem_post(&shared->mutex); + sem_wait(&shared->turnstile1); + + // Phase 2: Wait for all threads to exit the barrier + sem_wait(&shared->mutex); + shared->counter--; + if (shared->counter == 0) { + for (int i = 0; i < num_ranks; ++i) { sem_post(&shared->turnstile2); } + } + sem_post(&shared->mutex); + sem_wait(&shared->turnstile2); +} + +__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); +inline __m512 cvt_bf16_to_fp32(const __m256i src) +{ 
+ auto y = _mm512_cvtepu16_epi32(src); + return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2)); +} + +inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); +inline __m256i cvt_fp32_to_bf16(const __m512 src) +{ + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +} + +void reduce_2_bf16_buffers_iio(int num_elements, void* in0, void* in1, void* out) + __attribute__((target("avx512bw"))); + +void reduce_bf16_buffers(int start_elements, + int num_elements, + int num_buffers, + int to_buffer_idx, + struct allreduce_workspace** workspace) + __attribute__((target("avx512bw"))); + +void reduce_2_fp32_buffers_iio(int num_elements, void* in0, void* in1, void* out) + __attribute__((target("avx512bw"))); + +void reduce_fp32_buffers(int start_elements, + int num_elements, + int num_buffers, + int to_buffer_idx, + struct allreduce_workspace** workspace) + __attribute__((target("avx512bw"))); + +// N_REDUCE_LIMIT is the number of buffers that can be reduced together in one shot. +// Compared with do N-1 2-reduces which needs 2*(N-1) read and N-1 write, +// N-reduce only needs N read and 1 write, this saves 2/3 memory bandwidth. +// When increase N_REDUCE_LIMIT to a bigger number, do the following steps +// 1. Extend REPEAT_ macros list down below +// 2. 
Extend switch cases which call "REPEAT(X, ...)" down below +#define N_REDUCE_LIMIT 16 + +void reduce_all_buffers(struct allreduce_workspace** workspace, + int start_elements, + int num_elements, + c10::ScalarType scalar_type, + int num_buffers, + int to_buffer_idx) +{ + switch (scalar_type) { + case c10::ScalarType::BFloat16: + if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) { + reduce_bf16_buffers( + start_elements, num_elements, num_buffers, to_buffer_idx, workspace); + } else { + for (int i = 0; i < num_buffers; i++) { + if (i == to_buffer_idx) continue; + reduce_2_bf16_buffers_iio( + num_elements, + workspace[i]->buffer + start_elements * 2, + workspace[to_buffer_idx]->buffer + start_elements * 2, + workspace[to_buffer_idx]->buffer + start_elements * 2); + } + } + break; + case c10::ScalarType::Float: + if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) { + reduce_fp32_buffers( + start_elements, num_elements, num_buffers, to_buffer_idx, workspace); + } else { + for (int i = 0; i < num_buffers; i++) { + if (i == to_buffer_idx) continue; + reduce_2_fp32_buffers_iio( + num_elements, + workspace[i]->buffer + start_elements * 4, + workspace[to_buffer_idx]->buffer + start_elements * 4, + workspace[to_buffer_idx]->buffer + start_elements * 4); + } + } + break; + default: assert(!"Should not get here"); + } +} + +#define REPEAT(N, x) REPEAT_##N(x) +#define REPEAT_1(x) x(1) +#define REPEAT_2(x) \ + REPEAT_1(x); \ + x(2) +#define REPEAT_3(x) \ + REPEAT_2(x); \ + x(3) +#define REPEAT_4(x) \ + REPEAT_3(x); \ + x(4) +#define REPEAT_5(x) \ + REPEAT_4(x); \ + x(5) +#define REPEAT_6(x) \ + REPEAT_5(x); \ + x(6) +#define REPEAT_7(x) \ + REPEAT_6(x); \ + x(7) +#define REPEAT_8(x) \ + REPEAT_7(x); \ + x(8) +#define REPEAT_9(x) \ + REPEAT_8(x); \ + x(9) +#define REPEAT_10(x) \ + REPEAT_9(x); \ + x(10) +#define REPEAT_11(x) \ + REPEAT_10(x); \ + x(11) +#define REPEAT_12(x) \ + REPEAT_11(x); \ + x(12) +#define REPEAT_13(x) \ + REPEAT_12(x); \ + x(13) +#define REPEAT_14(x) \ + REPEAT_13(x); \ + x(14) +#define REPEAT_15(x) \ + REPEAT_14(x); \ + x(15) + +#define CVT_ADD_BF16(x) \ + do { \ + auto in##x##_val = \ + cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x]->buffer + i))); \ + inout_val = _mm512_add_ps(inout_val, in##x##_val); \ + } while (0) + +// Reduce functions down below use vectorized algorithm, the number of bytes processed each +// iteration depends on vector length. 
256bit vector ==> 32 bytes, 512bit vector ==> 64 bytes +// If you change implementation of reduce_2_bf16_buffers_iio or reduce_2_fp32_buffers_iio, check +// whether this number needs to be changed +#define VECTOR_LENGTH_IN_BYTES 32 + +void reduce_bf16_buffers(int start_elements, + int num_elements, + int num_buffers, + int to_buffer_idx, + struct allreduce_workspace** workspace) +{ + const int element_size = 2; + const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size; + int main_elements = num_elements - (num_elements % vector_length); + int remain_elements = num_elements % vector_length; + + // process aligned part +#pragma omp parallel for + for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size; + i += VECTOR_LENGTH_IN_BYTES) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[0]->buffer + i))); + switch (num_buffers) { + case 16: REPEAT(15, CVT_ADD_BF16); break; + case 15: REPEAT(14, CVT_ADD_BF16); break; + case 14: REPEAT(13, CVT_ADD_BF16); break; + case 13: REPEAT(12, CVT_ADD_BF16); break; + case 12: REPEAT(11, CVT_ADD_BF16); break; + case 11: REPEAT(10, CVT_ADD_BF16); break; + case 10: REPEAT(9, CVT_ADD_BF16); break; + case 9: REPEAT(8, CVT_ADD_BF16); break; + case 8: REPEAT(7, CVT_ADD_BF16); break; + case 7: REPEAT(6, CVT_ADD_BF16); break; + case 6: REPEAT(5, CVT_ADD_BF16); break; + case 5: REPEAT(4, CVT_ADD_BF16); break; + case 4: REPEAT(3, CVT_ADD_BF16); break; + case 3: REPEAT(2, CVT_ADD_BF16); break; + default: assert(!"Should not get here."); + } + _mm256_storeu_si256((__m256i*)(workspace[to_buffer_idx]->buffer + i), + cvt_fp32_to_bf16(inout_val)); + } + + // process remaining part + int i = (start_elements + main_elements) * element_size; + while (remain_elements > 0) { + float val = 0.0f; + for (int j = 0; j < num_buffers; j++) { val += *(at::BFloat16*)(workspace[j]->buffer + i); } + *(at::BFloat16*)(workspace[to_buffer_idx]->buffer + i) = val; + remain_elements--; + i += element_size; + } +} + +void reduce_2_bf16_buffers_iio(int num_elements, void* in0, void* in1, void* out) +{ + const int element_size = 2; + const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size; + int main_elements = num_elements - (num_elements % vector_length); + int remain_elements = num_elements % vector_length; + + // process aligned part +#pragma omp parallel for + for (int i = 0; i < main_elements * element_size; i += VECTOR_LENGTH_IN_BYTES) { + auto in0_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in0 + i))); + auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in1 + i))); + auto out_val = _mm512_add_ps(in0_val, in1_val); + _mm256_storeu_si256((__m256i*)((char*)out + i), cvt_fp32_to_bf16(out_val)); + } + + // process remaining part + int i = main_elements * element_size; + while (remain_elements > 0) { + float in0_val = *((at::BFloat16*)((char*)in0 + i)); + float in1_val = *((at::BFloat16*)((char*)in1 + i)); + *((at::BFloat16*)((char*)out + i)) = in0_val + in1_val; + remain_elements--; + i += element_size; + } +} + +#define CVT_ADD_F32(x) \ + do { \ + auto in##x##_val = _mm256_loadu_ps((float*)(workspace[x]->buffer + i)); \ + inout_val = _mm256_add_ps(inout_val, in##x##_val); \ + } while (0) + +void reduce_fp32_buffers(int start_elements, + int num_elements, + int num_buffers, + int to_buffer_idx, + struct allreduce_workspace** workspace) +{ + const int element_size = 4; + const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size; + int main_elements = num_elements - 
(num_elements % vector_length); + int remain_elements = num_elements % vector_length; + + // process aligned part +#pragma omp parallel for + for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size; + i += VECTOR_LENGTH_IN_BYTES) { + auto inout_val = _mm256_loadu_ps((float*)(workspace[0]->buffer + i)); + switch (num_buffers) { + case 16: REPEAT(15, CVT_ADD_F32); break; + case 15: REPEAT(14, CVT_ADD_F32); break; + case 14: REPEAT(13, CVT_ADD_F32); break; + case 13: REPEAT(12, CVT_ADD_F32); break; + case 12: REPEAT(11, CVT_ADD_F32); break; + case 11: REPEAT(10, CVT_ADD_F32); break; + case 10: REPEAT(9, CVT_ADD_F32); break; + case 9: REPEAT(8, CVT_ADD_F32); break; + case 8: REPEAT(7, CVT_ADD_F32); break; + case 7: REPEAT(6, CVT_ADD_F32); break; + case 6: REPEAT(5, CVT_ADD_F32); break; + case 5: REPEAT(4, CVT_ADD_F32); break; + case 4: REPEAT(3, CVT_ADD_F32); break; + case 3: REPEAT(2, CVT_ADD_F32); break; + default: assert(!"Should not get here."); + } + _mm256_storeu_ps((float*)(workspace[to_buffer_idx]->buffer + i), inout_val); + } + + // process remaining part + int i = (start_elements + main_elements) * element_size; + while (remain_elements > 0) { + float val = 0.0f; + for (int j = 0; j < num_buffers; j++) { val += *(float*)(workspace[j]->buffer + i); } + *(float*)(workspace[to_buffer_idx]->buffer + i) = val; + remain_elements--; + i += element_size; + } +} + +void reduce_2_fp32_buffers_iio(int num_elements, void* in0, void* in1, void* out) +{ + const int element_size = 4; + const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size; + int main_elements = num_elements - (num_elements % vector_length); + int remain_elements = num_elements % vector_length; + + // process aligned part +#pragma omp parallel for + for (int i = 0; i < main_elements * element_size; i += VECTOR_LENGTH_IN_BYTES) { + auto in0_val = _mm256_loadu_ps((float*)((char*)in0 + i)); + auto in1_val = _mm256_loadu_ps((float*)((char*)in1 + i)); + auto out_val = _mm256_add_ps(in0_val, in1_val); + _mm256_storeu_ps((float*)((char*)out + i), out_val); + } + + // process remaining part + int i = main_elements * element_size; + while (remain_elements > 0) { + float in0_val = *((float*)((char*)in0 + i)); + float in1_val = *((float*)((char*)in1 + i)); + *((float*)((char*)out + i)) = in0_val + in1_val; + remain_elements--; + i += element_size; + } +} + +static bool is_initialized = 0; +static int world_size; +static int world_rank; + +void shm_initialize(int size, int rank, char* addr_string, char* port_string) +{ + if (is_initialized) return; + is_initialized = 1; + + world_size = size; + world_rank = rank; + + char shm_name_prefix[NAME_BUF_SIZE]; + char shm_name[NAME_BUF_SIZE]; + snprintf(shm_name_prefix, + NAME_BUF_SIZE, + "%s_%d_%s_%s", + SHM_BUFFER_NAME, + getuid(), + addr_string, + port_string); + // create shared workspace for SHM based allreduce + SharedData allreduce_buffer; + // allocate workspace_buf for current rank + struct allreduce_workspace* workspace_buf; + struct allreduce_workspace* workspace_buf_other; + workspace_buf = (struct allreduce_workspace*)malloc(sizeof(struct allreduce_workspace)); + snprintf(shm_name, NAME_BUF_SIZE, "%s_%d", shm_name_prefix, rank); + shared_create(&allreduce_buffer, shm_name, workspace_buf, sizeof(struct allreduce_workspace)); + workspace_buf = (struct allreduce_workspace*)allreduce_buffer.bytes; + workspace_buf->state = coll_begin; + + // create the workspace pointer list + workspace = (struct allreduce_workspace**)malloc(size * sizeof(struct 
allreduce_workspace*)); + + // map shm of all ranks + for (int i = 0; i < size; i++) { + if (i != rank) { + snprintf(shm_name, NAME_BUF_SIZE, "%s_%d", shm_name_prefix, i); + // printf("open %s, %d\n", shm_name, rank); + do { + shared_open(&allreduce_buffer, shm_name, sizeof(struct allreduce_workspace)); + } while (allreduce_buffer.descriptor == -1 && errno == ENOENT); + workspace_buf_other = (struct allreduce_workspace*)allreduce_buffer.bytes; + workspace[i] = workspace_buf_other; + } else { + workspace[i] = workspace_buf; + workspace_buf->counter = 0; + sem_init(&workspace_buf->mutex, 1, 1); + sem_init(&workspace_buf->turnstile1, 1, 0); + sem_init(&workspace_buf->turnstile2, 1, 0); + } + } +} + +static void parallel_memcpy(void* to, void* from, size_t n_bytes) + __attribute__((target("avx512bw"))); +static void parallel_memcpy(void* to, void* from, size_t n_bytes) +{ + auto aligned_bytes = n_bytes - (n_bytes % VECTOR_LENGTH_IN_BYTES); + // process aligned part +#pragma omp parallel for + for (int i = 0; i < aligned_bytes; i += VECTOR_LENGTH_IN_BYTES) { + auto val = _mm256_loadu_si256((__m256i*)((char*)from + i)); + _mm256_storeu_si256((__m256i*)((char*)to + i), val); + } + + // process remaining part + for (int i = aligned_bytes; i < n_bytes; i++) { *((char*)to + i) = *((char*)from + i); } +} + +#define positive_mod(num, mod) ((((num) % (mod)) + (mod)) % (mod)) +#define rank_mod(rank) positive_mod(rank, world_size) +size_t slice_size(size_t chunk_el, int slice_idx) +{ + size_t slice_size = chunk_el / world_size; + return slice_idx == world_size - 1 ? slice_size + (chunk_el % world_size) : slice_size; +} + +char* slice_data(char* data_ptr, size_t chunk_el, int el_size, int slice_idx) +{ + size_t slice_size = chunk_el / world_size; + size_t el_offset = slice_size * slice_idx; + return data_ptr + el_offset * el_size; +} + +size_t slice_el_start(size_t chunk_el, int slice_idx) +{ + size_t slice_size = chunk_el / world_size; + return slice_size * slice_idx; +} + +void naive_all_reduce(char* data_ptr, + c10::ScalarType scalar_type, + size_t chunk_size, + size_t chunk_el) +{ + parallel_memcpy(workspace[world_rank]->buffer, data_ptr, chunk_size); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_allreduce_naive__copy_in_done; + + if (world_rank == 0) { + // compute allreduce result on rank 0 + for (int i = 1; i < world_size; i++) { + // wait until the other rank has copied its buffer in + wait_buffer_state_until(i, coll_allreduce_naive__copy_in_done); + } + reduce_all_buffers(workspace, 0, chunk_el, scalar_type, world_size, 0); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_allreduce_naive__reduce_done; + parallel_memcpy(data_ptr, workspace[0]->buffer, chunk_size); + } + if (world_rank != 0) { + wait_buffer_state_until(0, coll_allreduce_naive__reduce_done); + parallel_memcpy(data_ptr, workspace[0]->buffer, chunk_size); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_allreduce_naive__copy_out_done; + } + if (world_rank == 0) { + for (int i = 1; i < world_size; i++) { + wait_buffer_state_until(i, coll_allreduce_naive__copy_out_done); + } + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_begin; + } + if (world_rank != 0) { + // if rank 0 spins too fast it could already be in state 1 of the next allreduce + // in that case wait_buffer_state_until(0, 0) may cause a deadlock + // what we are certain of is that once rank 0 finishes, its state won't be 2 + 
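// so we simply wait for rank 0 to leave the reduce_done state (2) before resetting our own state to coll_begin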
wait_buffer_state_until_not(0, coll_allreduce_naive__reduce_done); + workspace[world_rank]->state = coll_begin; + } +} + +// distributed naive allreduce: each rank does a naive reduce on its own slice +void distributed_naive_reduce(char* data_ptr, + c10::ScalarType scalar_type, + size_t chunk_size, + size_t chunk_el) +{ +#ifdef DO_PROFILE + static double total_t1_t0 = 0.0; + static double total_t2_t1 = 0.0; + static double total_t3_t2 = 0.0; + static double total_t4_t3 = 0.0; + static double total_t5_t4 = 0.0; + static int count = -16; // warmup + auto t0 = std::chrono::system_clock::now(); +#endif + + int data_size = chunk_size / chunk_el; + parallel_memcpy(workspace[world_rank]->buffer, data_ptr, chunk_size); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_allreduce_naive__copy_in_done; + +#ifdef DO_PROFILE + auto t1 = std::chrono::system_clock::now(); +#endif + + for (int i = 0; i < world_size; i++) { + // wait until all the other ranks have copied their buffers in + wait_buffer_state_until_range(i, coll_allreduce_naive__copy_in_done, 2); + } + +#ifdef DO_PROFILE + auto t2 = std::chrono::system_clock::now(); +#endif + + // reduce scatter + reduce_all_buffers(workspace, + slice_el_start(chunk_el, world_rank), + slice_size(chunk_el, world_rank), + scalar_type, + world_size, + world_rank); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_allreduce_naive__reduce_done; + +#ifdef DO_PROFILE + auto t3 = std::chrono::system_clock::now(); +#endif + + for (int i = 0; i < world_size; i++) { + int rank = (i + world_rank) % world_size; + // wait until the other rank has reduced its slice + wait_buffer_state_until_range(rank, coll_allreduce_naive__reduce_done, 2); + parallel_memcpy(slice_data(data_ptr, chunk_el, data_size, rank), + slice_data(workspace[rank]->buffer, chunk_el, chunk_size / chunk_el, rank), + slice_size(chunk_el, rank) * data_size); + } + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_allreduce_naive__copy_out_done; + +#ifdef DO_PROFILE + auto t4 = std::chrono::system_clock::now(); +#endif + + for (int i = 0; i < world_size; i++) { + wait_buffer_state_until_not(i, coll_allreduce_naive__reduce_done); + } + + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->state = coll_begin; + +#ifdef DO_PROFILE + auto t5 = std::chrono::system_clock::now(); + count++; + if (count > 0) { + total_t1_t0 += std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count(); + total_t2_t1 += std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); + total_t3_t2 += std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count(); + total_t4_t3 += std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count(); + total_t5_t4 += std::chrono::duration_cast<std::chrono::microseconds>(t5 - t4).count(); + if (world_rank == 0 && count == 1000) { + printf("distributed_naive_reduce time breakdown:\n"); + printf("\tcopy input buffer: %.2f\n", total_t1_t0 / count); + printf("\twait for copy: %.2f\n", total_t2_t1 / count); + printf("\treduce: %.2f\n", total_t3_t2 / count); + printf("\tcopy buffer to output: %.2f\n", total_t4_t3 / count); + printf("\twait finish: %.2f\n", total_t5_t4 / count); + } + } +#endif +} + +void all_reduce_outer_loop(torch::Tensor& data, size_t numel, int data_size) +{ + for (int offset = 0; offset < data_size; offset += MAX_BUF_SIZE) { + auto data_ptr = ((char*)(data.data_ptr()) + offset); + size_t chunk_size = data_size - offset > MAX_BUF_SIZE ? 
MAX_BUF_SIZE : data_size - offset; + size_t chunk_el = chunk_size / (data_size / numel); + if (chunk_size < NAIVE_ALLREDUCE_THRESHOLD) + naive_all_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el); + else + distributed_naive_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el); + } +} diff --git a/csrc/cpu/comm/shm.h b/csrc/cpu/comm/shm.h new file mode 100644 index 000000000000..7f73197a8caa --- /dev/null +++ b/csrc/cpu/comm/shm.h @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#ifndef __SHM_COLLECTIVES__ +#define __SHM_COLLECTIVES__ +#define VECTOR_LENGTH_IN_BYTES 32 +void shm_initialize(int size, int rank, char* addr_string, char* port_string); +void all_reduce_outer_loop(torch::Tensor& data, size_t numel, int data_size); +void barrier_wait(int root_idx, int num_ranks); +#endif diff --git a/csrc/fp_quantizer/includes/context.h b/csrc/fp_quantizer/includes/context.h new file mode 100644 index 000000000000..5bd9badbcb4f --- /dev/null +++ b/csrc/fp_quantizer/includes/context.h @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include +#include +#include +#include +#include "cublas_v2.h" +#include "cuda.h" +#include "curand.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#define WARP_SIZE 32 + +class FPContext { +public: + FPContext() : _seed(42) + { + curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(_gen, 123); + } + + virtual ~FPContext() {} + + static FPContext& Instance() + { + static FPContext _ctx; + return _ctx; + } + + curandGenerator_t& GetRandGenerator() { return _gen; } + + cudaStream_t GetCurrentStream() + { + // get current pytorch stream. + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + return stream; + } + + std::pair IncrementOffset(uint64_t offset_inc) + { + uint64_t offset = _curr_offset; + _curr_offset += offset_inc; + return std::pair(_seed, offset); + } + + void SetSeed(uint64_t new_seed) { _seed = new_seed; } + +private: + curandGenerator_t _gen; + cublasHandle_t _cublasHandle; + uint64_t _seed; + uint64_t _curr_offset; +}; diff --git a/csrc/fp_quantizer/includes/quantize.h b/csrc/fp_quantizer/includes/quantize.h new file mode 100644 index 000000000000..2204c1ba74fc --- /dev/null +++ b/csrc/fp_quantizer/includes/quantize.h @@ -0,0 +1,115 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include + +#include + +#include +#include +#include + +#define QUANT_SWITCH(Q_BITS, ...) 
\ + [&] { \ + if (12 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 3; \ + __VA_ARGS__(); \ + } else if (13 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 3; \ + __VA_ARGS__(); \ + } else if (10 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (11 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (28 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 12; \ + constexpr int CONST_Q_MANTISA_BITS = 7; \ + __VA_ARGS__(); \ + } else if (29 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 12; \ + constexpr int CONST_Q_MANTISA_BITS = 7; \ + __VA_ARGS__(); \ + } else if (6 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 6; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (7 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 6; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (2 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 4; \ + constexpr int CONST_Q_MANTISA_BITS = 1; \ + __VA_ARGS__(); \ + } else { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 4; \ + constexpr int CONST_Q_MANTISA_BITS = 1; \ + __VA_ARGS__(); \ + } \ + }() + +#define DEQUANT_SWITCH(Q_MANTISA_EXPONENT_BITS, ...) \ + [&] { \ + if (12 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 3; \ + constexpr int CONST_Q_EXPONENT_BITS = 4; \ + __VA_ARGS__(); \ + } else if (10 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + constexpr int CONST_Q_EXPONENT_BITS = 5; \ + __VA_ARGS__(); \ + } else if (28 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 7; \ + constexpr int CONST_Q_EXPONENT_BITS = 4; \ + __VA_ARGS__(); \ + } else if (6 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + constexpr int CONST_Q_EXPONENT_BITS = 3; \ + __VA_ARGS__(); \ + } else { \ + constexpr int CONST_Q_MANTISA_BITS = 1; \ + constexpr int CONST_Q_EXPONENT_BITS = 2; \ + __VA_ARGS__(); \ + } \ + }() + +template +void launch_quantization(T* val, + uint8_t* q_val, + int num_groups, + int group_size, + cudaStream_t stream, + float q_range, + int q_bits, + int q_mantisa_bits, + int stochastic_rounding); + +template +void launch_dequantization(uint8_t* val, + T* q_val, + int num_groups, + int group_size, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream); diff --git a/csrc/fp_quantizer/quantize.cpp b/csrc/fp_quantizer/quantize.cpp new file mode 100644 index 000000000000..4a88ff767636 --- /dev/null +++ b/csrc/fp_quantizer/quantize.cpp @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "quantize.h" + +#include +#include +#include + +#define DISPATCH_QUANTIZE(T_TYPE, C_TYPE, mantisa, exponent) \ + if (val.options().dtype() == torch::T_TYPE) { \ + launch_quantization((C_TYPE*)val.data_ptr(), \ + (uint8_t*)out.data_ptr(), \ + num_groups, \ + group_size, \ + at::cuda::getCurrentCUDAStream(), \ + q_range, \ + q_bits, \ + q_mantisa_bits, \ + stochastic_rounding); \ + } + +at::Tensor quantize(torch::Tensor& val, + int group_size, + int stochastic_rounding, + int q_bits, + int q_mantisa_bits) +{ + int total_elems = at::numel(val); + auto options = at::TensorOptions() + .dtype(torch::kInt8) + .layout(val.layout()) + .device(val.device()) + .requires_grad(false); + float q_range = q_bits == 8 ? (q_mantisa_bits == 3 ? 480.0 : 114688.0) : // fp8 ranges + (q_bits == 12 ? 510.0 : // fp12 range + (q_bits == 6 ? 28.0 : // fp6 range + 6.0)); // fp4 range (using power 2); TODO (Reza): add the power-4 + // in case accuracy is not matching! + int num_groups = total_elems / group_size; + auto out = torch::empty({num_groups, group_size * q_bits / 8 + 4}, options); + + DISPATCH_QUANTIZE(kHalf, __half, 23, 8); +#ifdef BF16_AVAILABLE + DISPATCH_QUANTIZE(kBFloat16, __nv_bfloat16, 23, 8); +#endif + + return out; +} + +#define DISPATCH_DEQUANTIZE(T_TYPE, C_TYPE, mantisa) \ + if (val.options().dtype() == torch::T_TYPE) { \ + launch_dequantization((uint8_t*)val_q.data_ptr(), \ + (C_TYPE*)val.data_ptr(), \ + num_groups, \ + group_size, \ + q_mantisa_bits, \ + q_exponent_bits, \ + at::cuda::getCurrentCUDAStream()); \ + return; \ + } + +void dequantize(torch::Tensor& val, + torch::Tensor& val_q, + int group_size, + int q_mantisa_bits, + int q_exponent_bits) +{ + int total_elems = at::numel(val); + + int num_groups = total_elems / group_size; + + DISPATCH_DEQUANTIZE(kHalf, __half, 10); +#ifdef BF16_AVAILABLE + DISPATCH_DEQUANTIZE(kBFloat16, __nv_bfloat16, 7); +#endif +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("quantize", &quantize, "quantize function"); + m.def("dequantize", &dequantize, "dequantize function"); +} diff --git a/csrc/fp_quantizer/quantize.cu b/csrc/fp_quantizer/quantize.cu new file mode 100644 index 000000000000..5f0b58f124f0 --- /dev/null +++ b/csrc/fp_quantizer/quantize.cu @@ -0,0 +1,395 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include "context.h" +#include "memory_access_utils.h" +#include "quantize.h" +#include "reduction_utils.h" + +#include +#include + +#include +#include + +#include +#include + +using ROp = reduce::ROpType; + +namespace quantization { + +constexpr int access_granularity = 16; +constexpr int quanitzed_access_granularity = 4; +constexpr int quanitzed_access_granularity_6bits = 2; +constexpr int threads = 256; +constexpr int warps = threads / 32; + +} // namespace quantization + +template +__device__ void round(uint32_t& mantisa, uint32_t& dst_exponent, curandStatePhilox4_32_10_t* state) +{ + constexpr uint32_t mantisa_mask = (1 << (_mantisa_bits - q_mantisa_bits)) - 1; + uint32_t offset = stochastic_rounding ? (curand_poisson(state, 10) & mantisa_mask) + : 1 << (_mantisa_bits - q_mantisa_bits - 1); + mantisa += offset; + dst_exponent += (((mantisa & ~mantisa_mask) == (1 << _mantisa_bits)) ? 
1 : 0); +} + +template +__device__ void clip(uint32_t& exponent, uint32_t& mantisa) +{ + constexpr uint32_t max_exponent = (1 << (q_exponent_bits - 1)) + (1 << (_exponent_bits - 1)); + constexpr uint32_t min_exponent = + (1 << (_exponent_bits - 1)) - ((1 << (q_exponent_bits - 1)) - 1); + if (exponent > max_exponent) { + exponent = max_exponent; + mantisa = (((uint32_t)-1) >> (32 - q_mantisa_bits)) << 1; //.11 .. 10 + } + if (exponent < min_exponent) { + exponent = min_exponent; + mantisa = 0; + } +} + +template +__global__ void apply_quantization(T* val, + uint8_t* q_val, + int group_size, + std::pair seed, + float q_range) +{ + int tidx = threadIdx.x; + int wid = tidx >> 5; + int lane = tidx & 0x1f; + int gid = blockIdx.x * quantization::warps + wid; + + constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; + constexpr uint32_t _mantisa_mask = (1 << _mantisa_bits) - 1; + constexpr uint32_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; + constexpr uint32_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); + // CG helpers + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + constexpr uint32_t load_stride = vector_size * hw_warp_size; + constexpr uint32_t store_stride = (total_q_bits * vector_size / 8) * hw_warp_size; + const uint32_t thread_offset = lane * vector_size; + const uint32_t store_thread_offset = lane * (total_q_bits * vector_size / 8); + const uint32_t base_load_offset = gid * group_size + thread_offset; + const uint32_t base_store_offset = + gid * ((group_size * total_q_bits / 8) + 4) + + store_thread_offset; // 4-byte for saving the scale per group + const T* load_base_ptr = val + base_load_offset; + T tmp_buf[unroll * vector_size]; + T cur_max; + reduce::init(&cur_max); + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init(seed.first, idx, seed.second, &state); + +#pragma unroll + for (int i = 0; i < unroll; i++) { + if (i * load_stride + thread_offset < group_size) { + mem_access::load_global( + &tmp_buf[vector_size * i], load_base_ptr + i * load_stride); + for (int j = 0; j < vector_size; j++) + cur_max = reduce::element(cur_max, __habs(tmp_buf[i * vector_size + j])); + } + } + reduce::_block(tb, warp, &cur_max); + + int mantisa_mask = ((1 << q_mantisa_bits) - 1); + mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); + + uint8_t* store_base_ptr = q_val + base_store_offset; + float scale = (float)q_range / conversion::to(cur_max); +#pragma unroll + for (int i = 0; i < unroll; i++) { + if (i * load_stride + thread_offset < group_size) { + uint64_t q_buf = 0; + uint64_t q_buf1 = 0; +#pragma unroll + for (int j = 0; j < vector_size; j++) { + float val_f = conversion::to(tmp_buf[i * vector_size + j]) * scale; + uint32_t* data = reinterpret_cast(&val_f); + uint32_t sign = (data[0] & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint32_t cur_exponent = (data[0] & _exponent_mask) >> _mantisa_bits; + uint32_t dst_mantisa = (data[0] & _mantisa_mask); + + uint32_t dst_exponent = cur_exponent; + + round<_mantisa_bits, q_mantisa_bits, stochastic_rounding>( + dst_mantisa, dst_exponent, &state); + if (cur_exponent != 0) + clip<_mantisa_bits, _exponent_bits, q_mantisa_bits, q_exponent_bits>( + dst_exponent, dst_mantisa); + + dst_mantisa = (dst_mantisa & mantisa_mask) >> (_mantisa_bits - q_mantisa_bits); + + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = 
(dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; + if (total_q_bits == 8 || total_q_bits == 4 || total_q_bits == 6) + q_buf = q_buf | + ((uint64_t)((uint8_t)(sign << (q_exponent_bits + q_mantisa_bits) | + (dst_exponent << q_mantisa_bits) | dst_mantisa)) + << j * total_q_bits); + else if (total_q_bits == 12) { + if (j < 5) + q_buf = + q_buf | + ((uint64_t)((uint16_t)(sign << (q_exponent_bits + q_mantisa_bits) | + (dst_exponent << q_mantisa_bits) | dst_mantisa)) + << j * total_q_bits); + else + q_buf1 = + q_buf1 | + ((uint64_t)((uint16_t)(sign << (q_exponent_bits + q_mantisa_bits) | + (dst_exponent << q_mantisa_bits) | dst_mantisa)) + << (j - 5) * total_q_bits); + } + } + if (total_q_bits == 12) { + uint64_t last_nibble_mask = 0xf; + last_nibble_mask = q_buf1 & last_nibble_mask; + q_buf = (last_nibble_mask << 60) | q_buf; + q_buf1 >>= 4; + } + uint8_t* int8_data = reinterpret_cast(&q_buf); + uint8_t* int8_data1 = reinterpret_cast(&q_buf1); + if (total_q_bits == 6) { + mem_access::store_global( + store_base_ptr + i * store_stride, int8_data); + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity_6bits, + int8_data + quantization::quanitzed_access_granularity_6bits); + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity_6bits * 2, + int8_data + 2 * quantization::quanitzed_access_granularity_6bits); + } else { + mem_access::store_global( + store_base_ptr + i * store_stride, int8_data); + + if (total_q_bits > 4) { + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity, + int8_data + quantization::quanitzed_access_granularity); + if (total_q_bits == 12) { + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity * 2, + int8_data1); + } + } + } + } + } + if (lane == 0) { + float q_scale = conversion::to(cur_max) / (float)q_range; + uint8_t* scale_as_int8 = reinterpret_cast(&q_scale); + uint32_t scale_offset = + gid * ((group_size * total_q_bits / 8) + 4) + (group_size * total_q_bits / 8); + if (total_q_bits != 6) + mem_access::store_global( + q_val + scale_offset, scale_as_int8); + else { + mem_access::store_global( + q_val + scale_offset, scale_as_int8); + mem_access::store_global( + q_val + scale_offset + quantization::quanitzed_access_granularity_6bits, + scale_as_int8 + quantization::quanitzed_access_granularity_6bits); + } + } +} + +template +__global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size, int total_num_elements) +{ + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + int tidx = (blockIdx.x * blockDim.x + threadIdx.x) * vector_size; + + constexpr int quantized_bits = _mantisa_bits + _exponent_bits + 1; + constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; + constexpr uint16_t _mantisa_mask = (1 << _mantisa_bits) - 1; + constexpr uint16_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; + constexpr uint16_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); + const uint32_t g_index = (tidx / group_size); + const uint32_t group_size_bytes = (group_size * quantized_bits / 8); + const uint8_t* load_base_ptr = + val + g_index * (group_size_bytes + 4) + (tidx % group_size) * quantized_bits / 8; + + int mantisa_mask = ((1 << q_mantisa_bits) - 1); + mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); + + T* store_base_ptr = q_val + tidx; + float scale; + + 
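// Layout note: each quantized group stores group_size * quantized_bits / 8 bytes of packed values
// followed by a 4-byte fp32 scale, so the scale sits at offset group_size_bytes within the group.
// Because the 6-bit payload size need not be a multiple of 4, the scale bytes are read below through
// a byte pointer, split into two 2-byte accesses for that format.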
uint8_t* scale_as_int8 = reinterpret_cast(&scale); + if (quantized_bits == 6) { + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + mem_access::load_global( + scale_as_int8 + quantization::quanitzed_access_granularity_6bits, + val + g_index * (group_size_bytes + 4) + group_size_bytes + + quantization::quanitzed_access_granularity_6bits); + } else + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + + if (tidx < total_num_elements) { + uint64_t q_buf_in; + uint64_t q_buf_in1; + uint8_t* int8_data = reinterpret_cast(&q_buf_in); + uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); + if (quantized_bits == 6) { + mem_access::load_global( + int8_data, load_base_ptr); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits, + load_base_ptr + quantization::quanitzed_access_granularity_6bits); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits * 2, + load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { + mem_access::load_global(int8_data, + load_base_ptr); + if (quantized_bits > 4) { + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity, + load_base_ptr + quantization::quanitzed_access_granularity); + if (quantized_bits == 12) { + mem_access::load_global( + int8_data1, load_base_ptr + quantization::quanitzed_access_granularity * 2); + } + } + } + T store_buf[vector_size]; + uint16_t* q_buf = reinterpret_cast(store_buf); +#pragma unroll + for (int j = 0; j < vector_size; j++) { + uint16_t new_data; + if (j < 5 || quantized_bits != 12) { + new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); + } else { + if (j == 5) { + new_data = (uint16_t)(q_buf_in1); + new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); + } else + new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); + } + + uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; + uint16_t dst_mantisa = (new_data & _mantisa_mask); + + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; + + q_buf[j] = + ((sign << (q_exponent_bits + q_mantisa_bits)) | (dst_exponent << q_mantisa_bits) | + (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); + float up_cast = conversion::to(store_buf[j]); + store_buf[j] = conversion::to(up_cast * scale); + } + mem_access::store_global(store_base_ptr, store_buf); + } +} + +#define LAUNCH_FOR_QUANTIZATION_UNROLL(COUNT) \ + case COUNT: \ + apply_quantization \ + <<>>(val, q_val, group_size, seed, q_range); \ + break; + +template +void launch_quantization(T* val, + uint8_t* q_val, + int num_groups, + int group_size, + cudaStream_t stream, + float q_range, + int q_bits, + int q_mantisa_bits, + int stochastic_rounding) +{ + const dim3 grid((num_groups + quantization::warps - 1) / quantization::warps); + const dim3 block(quantization::threads); + + std::pair seed = FPContext::Instance().IncrementOffset(16); + + constexpr int vals_per_unroll = hw_warp_size * quantization::access_granularity / sizeof(T); + + const int copy_unroll = (group_size + vals_per_unroll - 1) / vals_per_unroll; + QUANT_SWITCH((q_bits - q_mantisa_bits - 1) * q_mantisa_bits + stochastic_rounding, [&] { + switch (copy_unroll) { + LAUNCH_FOR_QUANTIZATION_UNROLL(1) + LAUNCH_FOR_QUANTIZATION_UNROLL(2) + 
LAUNCH_FOR_QUANTIZATION_UNROLL(3) + LAUNCH_FOR_QUANTIZATION_UNROLL(4) + LAUNCH_FOR_QUANTIZATION_UNROLL(5) + LAUNCH_FOR_QUANTIZATION_UNROLL(6) + } + }); +} +#define INSTANTIATE_LAUNCH_QUANTIZATION(T, mantisa, exponent) \ + template void launch_quantization( \ + T*, uint8_t*, int, int, cudaStream_t, float q_range, int, int, int); +// fp8(E4M3), nearest-rounding +#ifdef BF16_AVAILABLE +INSTANTIATE_LAUNCH_QUANTIZATION(__nv_bfloat16, 23, 8); +#endif +INSTANTIATE_LAUNCH_QUANTIZATION(__half, 23, 8); + +template +void launch_dequantization(uint8_t* val, + T* q_val, + int num_groups, + int group_size, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream) +{ + int blocks = ((num_groups * group_size) - 1) / + (quantization::threads * (quantization::access_granularity / sizeof(T))) + + 1; + const dim3 grid(blocks); + const dim3 block(quantization::threads); + DEQUANT_SWITCH(q_mantisa_bits * q_exponent_bits, [&] { + apply_dequantization + <<>>(val, q_val, group_size, (num_groups * group_size)); + }); +} +#define INSTANTIATE_LAUNCH_DEQUANTIZATION(T, mantisa) \ + template void launch_dequantization(uint8_t*, T*, int, int, int, int, cudaStream_t); +// fp8(E4M3) +#ifdef BF16_AVAILABLE +INSTANTIATE_LAUNCH_DEQUANTIZATION(__nv_bfloat16, 7); +#endif +INSTANTIATE_LAUNCH_DEQUANTIZATION(__half, 10); diff --git a/csrc/includes/memory_access_utils.h b/csrc/includes/memory_access_utils.h index 6789714d27c7..cb990b75bbe8 100644 --- a/csrc/includes/memory_access_utils.h +++ b/csrc/includes/memory_access_utils.h @@ -868,6 +868,35 @@ __device__ __forceinline__ void store_global<4, StorePolicy::CacheStreaming>(voi #endif } +template <> +__device__ __forceinline__ void store_global<2>(void* dst, const void* src) +{ + const int16_t* data = reinterpret_cast(src); + + int16_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +} + +template <> +__device__ __forceinline__ void store_global<2, StorePolicy::CacheGlobal>(void* dst, + const void* src) +{ + const int16_t* data = reinterpret_cast(src); + + int16_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +} + +template <> +__device__ __forceinline__ void store_global<2, StorePolicy::CacheStreaming>(void* dst, + const void* src) +{ + const int16_t* data = reinterpret_cast(src); + + int16_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +} + /////////// Store Shared /////////// template <> diff --git a/csrc/includes/reduction_utils.h b/csrc/includes/reduction_utils.h index eb8efab77ac1..eb9afb66a894 100644 --- a/csrc/includes/reduction_utils.h +++ b/csrc/includes/reduction_utils.h @@ -159,6 +159,12 @@ DS_D_INLINE float element(const float lhs, const float rhs) return lhs + rhs; } +template <> +DS_D_INLINE double element(const double lhs, const double rhs) +{ + return lhs + rhs; +} + template <> DS_D_INLINE float element(const float lhs, const float rhs) { @@ -189,6 +195,19 @@ DS_D_INLINE __half element(const __half lhs, const __half rhs) #endif } +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat16 element(const __nv_bfloat16 lhs, const __nv_bfloat16 rhs) +{ +#if __CUDA_ARCH__ >= 800 + // Intrinsic limited to Ampere + newer + return __hmax(lhs, rhs); +#else + return (lhs > rhs) ? 
lhs : rhs; +#endif +} +#endif + template <> DS_D_INLINE __half element(const __half lhs, const __half rhs) { @@ -220,6 +239,21 @@ DS_D_INLINE __half2 element(const __half2 lhs, const __half2 rhs) #endif } +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat162 element(const __nv_bfloat162 lhs, const __nv_bfloat162 rhs) +{ +#if __CUDA_ARCH__ >= 800 + return __hmax2(lhs, rhs); +#else + __nv_bfloat162 ret_val; + ret_val.x = (lhs.x > rhs.x) ? lhs.x : rhs.x; + ret_val.y = (lhs.y > rhs.y) ? lhs.y : rhs.y; + return ret_val; +#endif +} +#endif + template <> DS_D_INLINE __half2 element(const __half2 lhs, const __half2 rhs) { @@ -295,6 +329,11 @@ DS_D_INLINE float init() { return 0.0f; } +template <> +DS_D_INLINE double init() +{ + return (double)0.0f; +} template <> DS_D_INLINE float init() @@ -331,6 +370,15 @@ DS_D_INLINE __half init() return __half(neg_inf); } +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat16 init() +{ + constexpr __nv_bfloat16_raw neg_inf = {0xFF80}; + return __nv_bfloat16(neg_inf); +} +#endif + template <> DS_D_INLINE __half2 init() { diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 6c7aa8b15ef9..fe0043547860 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -26,6 +26,7 @@ from . import module_inject from .accelerator import get_accelerator +from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedulerCallable from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER from .runtime.hybrid_engine import DeepSpeedHybridEngine @@ -42,7 +43,6 @@ from .comm.comm import init_distributed from .runtime import zero -from .runtime import DeepSpeedOptimizer, ZeROOptimizer from .runtime.compiler import is_compile_supported from .pipe import PipelineModule @@ -72,6 +72,7 @@ def initialize(args=None, model_parameters: Optional[torch.nn.Module] = None, training_data: Optional[torch.utils.data.Dataset] = None, lr_scheduler: Optional[Union[_LRScheduler, DeepSpeedSchedulerCallable]] = None, + distributed_port: int = TORCH_DISTRIBUTED_DEFAULT_PORT, mpu=None, dist_init_required: Optional[bool] = None, collate_fn=None, @@ -96,6 +97,8 @@ def initialize(args=None, lr_scheduler: Optional: Learning Rate Scheduler Object or a Callable that takes an Optimizer and returns a Scheduler object. 
The scheduler object should define a get_lr(), step(), state_dict(), and load_state_dict() methods + distributed_port: Optional: Master node (rank 0)'s free port that needs to be used for communication during distributed training + mpu: Optional: A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,world_size}() @@ -137,7 +140,9 @@ def initialize(args=None, global dist from deepspeed import comm as dist dist_backend = get_accelerator().communication_backend_name() - dist.init_distributed(dist_backend=dist_backend, dist_init_required=dist_init_required) + dist.init_distributed(dist_backend=dist_backend, + distributed_port=distributed_port, + dist_init_required=dist_init_required) # Set config using config_params for backwards compat if config is None and config_params is not None: diff --git a/deepspeed/autotuning/utils.py b/deepspeed/autotuning/utils.py index 8c9a5fa85bf2..b851353520fb 100644 --- a/deepspeed/autotuning/utils.py +++ b/deepspeed/autotuning/utils.py @@ -42,7 +42,7 @@ def find_replace_str(value, replace_dict): if not isinstance(value, str): return str(value) - matches = re.findall(r"\$[A-Za-z0-9_]+", value) + matches = re.findall(r"\$[\w]+", value) for var in matches: var_key = var.replace("$", "").lower() if var_key == "nvme_path": diff --git a/deepspeed/checkpoint/constants.py b/deepspeed/checkpoint/constants.py index f809a0c39270..b3f199a67b98 100644 --- a/deepspeed/checkpoint/constants.py +++ b/deepspeed/checkpoint/constants.py @@ -16,6 +16,7 @@ BASE_OPTIMIZER_STATE = 'base_optimizer_state' BASE_OPTIMIZER_STATE_STEP = 'base_optimizer_state_step' SINGLE_PARTITION_OF_FP32_GROUPS = "single_partition_of_fp32_groups" +PARAM_GROUPS = 'param_groups' GROUP_PADDINGS = 'group_paddings' PARTITION_COUNT = 'partition_count' ZERO_STAGE = 'zero_stage' diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index f40c5630899d..9ec5d0b169e4 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -22,6 +22,7 @@ OPTIMIZER_STATE_DICT, BASE_OPTIMIZER_STATE, SINGLE_PARTITION_OF_FP32_GROUPS, + PARAM_GROUPS, PARAM_SLICE_MAPPINGS, PARAM_SHAPES, PARAM, @@ -110,6 +111,9 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D): fp32=fp32_groups[param_group_id], ) + if "step" in state_groups[param_group_id]: + flat_state["step"] = state_groups[param_group_id]["step"] + for name, fragment_mapping in param_slice_mappings[param_group_id].items(): if pp_index > 0 and any(re.match(pattern, name) for pattern in pipeline_replicated_params): # Skip tied weights that are replicated in first and last pp stages @@ -138,8 +142,10 @@ def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, #print(f"{param_name}: {offset}: {numel} => {path}") - t = state_flat_tensor.narrow(0, offset, numel).clone() - _save_checkpoint(path, t) + # State might be a python int or a tensor + if state_name != "step" and torch.is_tensor(state_flat_tensor): + state_flat_tensor = state_flat_tensor.narrow(0, offset, numel).clone() + _save_checkpoint(path, state_flat_tensor) def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): @@ -147,8 +153,17 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): for tp_index in range(tp_degree): prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") paths = sorted(list(glob.glob(f"{prefix_path}.*"))) + if len(paths) == 0: + continue + shards = [torch.load(p) for p in paths] - slice = torch.cat(shards, 
dim=0).reshape(slice_shape) + + if state == "step": + assert all(v == shards[0] for v in shards), "All shards must have the same step value" + slice = shards[0] + else: + slice = torch.cat(shards, dim=0).reshape(slice_shape) + slices.append(slice) return slices @@ -177,6 +192,10 @@ def get_matched_pattern(patterns_, name_): return pattern_ return None + step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape) + if step_merged: + _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0]) + for state in ("fp32", "exp_avg", "exp_avg_sq"): slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) final_path = os.path.join(param_base_path, f"{state}.pt") @@ -227,13 +246,21 @@ def _get_chunks(l, n): def _do_parallel_work(do_work, work_chunks, num_workers): - pool = multiprocessing.Pool(num_workers) - results = [] - for batch in tqdm.tqdm(work_chunks): - res = pool.map(do_work, batch) - results.extend(res) - pool.close() - pool.join() + if num_workers > 1: + pool = multiprocessing.Pool(num_workers) + results = [] + for batch in tqdm.tqdm(work_chunks): + res = pool.map(do_work, batch) + results.extend(res) + pool.close() + pool.join() + else: + # No parallel pass for unit testing + # We can't create child processes in tests + results = [] + for batch in tqdm.tqdm(work_chunks): + res = [do_work(x) for x in batch] + results.extend(res) return results @@ -273,6 +300,7 @@ def _save_optimizer_state(args, ds_checkpoint): optim_sd = sd[OPTIMIZER_STATE_DICT] output_sd = {k: v for k, v in optim_sd.items() if k not in sharded_states} + output_sd[PARAM_GROUPS] = optim_sd[BASE_OPTIMIZER_STATE][PARAM_GROUPS] zero_output_folder = os.path.join(args.output_folder, "zero") output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") _save_checkpoint(output_file_path, output_sd) @@ -283,10 +311,9 @@ def _check_for_required_state(ds_checkpoint): assert universal_checkpoint_info is not None, f'Required {UNIVERSAL_CHECKPOINT_INFO} state is missing in checkpoint. Verify that client creates this state.' 
-def main(): +def main(args): print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') - args = parse_arguments() print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}') ds_checkpoint = DeepSpeedCheckpoint(args.input_folder) @@ -332,4 +359,5 @@ def main(): if __name__ == "__main__": - main() + args = parse_arguments() + main(args) diff --git a/deepspeed/checkpoint/reshape_utils.py b/deepspeed/checkpoint/reshape_utils.py index 15b6ce28b2fd..137607721ebf 100644 --- a/deepspeed/checkpoint/reshape_utils.py +++ b/deepspeed/checkpoint/reshape_utils.py @@ -4,9 +4,10 @@ # DeepSpeed Team import os +import re import torch from collections import OrderedDict -from .constants import (ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX) +from .constants import (ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX, MODEL_FILE_PREFIX) def basic_folder_validation(dir): @@ -38,12 +39,28 @@ def get_files(dir): return file_list +def sort_zero_files(files, prefix): + pattern = f"{prefix}([0-9]+)_{MODEL_FILE_PREFIX}([0-9]+)" + rank_pairs = [] + for f in files: + m = re.search(pattern, f) + if m: + dp_rank = int(m.group(1)) + mp_rank = int(m.group(2)) + rank_pairs.append((dp_rank, mp_rank, f)) + else: + raise ValueError(f"Cannot parse dp_rank and mp_rank from {f}") + + sorted_files = sorted(rank_pairs, key=lambda x: (x[0], x[1])) + return [f for _, _, f in sorted_files] + + def get_zero_files(dir): file_list = get_files(dir) for prefix in [ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX]: zero_files = get_files_with_prefix(file_list, prefix) if len(zero_files) > 0: - return zero_files + return sort_zero_files(zero_files, prefix) return [] diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index 542d1125c566..86c8dc904b8c 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -4,6 +4,7 @@ # DeepSpeed Team import os +import re import torch import types from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS) @@ -11,16 +12,25 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): hp_mapping = self._hp_mapping - optim_state_keys = hp_mapping.get_optim_state_keys() - hp_keys = [FP32_WEIGHT_KEY] + optim_state_keys - #print(f'{hp_keys=}') - checkpoint_files = {key: os.path.join(folder, f"{key}.pt") for key in hp_keys} - for file in checkpoint_files.values(): - assert os.path.isfile(file), f'{file} is not a valid file' + hp_mapping.optim_fragment = {} + hp_keys = [] + for file in os.listdir(folder): + # We expect files named something like "exp_avg.pt", "exp_avg_sq.pt", "fp32.pt" + pattern = r'(.+).pt' + match = re.search(pattern, file) + if match: + hp_keys.append(match.group(1)) + + step = None for key in hp_keys: - ckpt_file = checkpoint_files[key] + ckpt_file = os.path.join(folder, f"{key}.pt") ckpt_dict = torch.load(ckpt_file) + + if key == "step": + step = ckpt_dict + continue + full_hp_param = ckpt_dict[PARAM] # need to deal with slices that were averaged. 
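A note on the hunk above: the loader now derives the optimizer state keys from whatever .pt files are present in the parameter folder rather than from a fixed key list, and a step.pt entry is handed back separately as a scalar instead of being copied into a fragment. A minimal sketch of that discovery step, illustrative only (discover_hp_keys is a hypothetical helper, not part of DeepSpeed):

    import os
    import re

    def discover_hp_keys(folder):
        # e.g. ["fp32", "exp_avg", "exp_avg_sq", "step"], depending on which files exist
        keys = []
        for fname in os.listdir(folder):
            match = re.match(r"(.+)\.pt$", fname)
            if match:
                keys.append(match.group(1))
        return keys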
@@ -62,7 +72,6 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): assert full_param_numel == tp_world_size * tp_slice_numel, \ f'Loading {ckpt_file} full param numel {full_param_numel} != tensor slice numel {tp_slice_numel} * tp_world_size {tp_world_size}' - dst_tensor = hp_mapping.hp_fragment if key == FP32_WEIGHT_KEY else hp_mapping.get_optim_state_fragment(key) # print(f"{full_hp_param.shape=} {full_param_numel=} {folder=}") # print(f"{dst_tensor.shape=} {dst_tensor.numel()=}{folder=}") @@ -84,13 +93,23 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): lp_frag_address = hp_mapping.lp_fragment_address tp_hp_fragment = tp_hp_slice.narrow(0, lp_frag_address.start, lp_frag_address.numel) - assert dst_tensor.numel() == lp_frag_address.numel, \ - f'Load checkpoint {key} dst_tensor numel {dst_tensor.numel()} != src numel {lp_frag_address.numel}' # print(f"{key} SHAPE: {tp_hp_slice.shape=}") # print(f"{key} SHAPE: {dst_tensor.shape=}") # print(f"{key} SHAPE: {tp_hp_fragment.shape=}") - dst_tensor.data.copy_(tp_hp_fragment.data) + + if key == FP32_WEIGHT_KEY: + dst_tensor = hp_mapping.get_hp_fragment() + assert dst_tensor.numel() == lp_frag_address.numel, \ + f'Load checkpoint {key} dst numel {dst_tensor.numel()} != src numel {lp_frag_address.numel}' + dst_tensor.data.copy_(tp_hp_fragment.data) + else: + assert tp_hp_fragment.numel() == lp_frag_address.numel, \ + f'Load checkpoint {key} dst numel {tp_hp_fragment.numel()} != src numel {lp_frag_address.numel}' + + hp_mapping.optim_fragment[key] = tp_hp_fragment.clone().detach() + + return step def enable_universal_checkpoint(param_list): diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py index c65745d3dd0c..6730b93dfd4f 100644 --- a/deepspeed/checkpoint/zero_checkpoint.py +++ b/deepspeed/checkpoint/zero_checkpoint.py @@ -105,9 +105,11 @@ def _strip_tensor_paddings(self, sd): if group_paddings[key] == 0: continue for state_name, state_value in group_state.items(): - if torch.is_tensor(state_value): + if state_name != "step" and torch.is_tensor(state_value): raw_length = state_value.numel() - group_paddings[key] group_state[state_name] = torch.narrow(state_value, 0, 0, raw_length).clone() + else: + group_state[state_name] = state_value def _clear_group_paddings(self, sd): group_paddings = self._get_optimizer_state(sd, GROUP_PADDINGS) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index d96c63d84e60..35c31f9fe2ec 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -9,6 +9,7 @@ from .backend import * from .comm import * from ..runtime import compiler +from deepspeed.utils.torch import required_torch_version import os DS_COMM_ALL_GATHER_OFF = False @@ -18,40 +19,19 @@ DS_COMM_REDUCE_OFF = False -def is_torch_ver_eq_2_0(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR == 2 and TORCH_MINOR == 0: - return True - return False - - -def is_torch_ver_ge_2_1(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: - return True - return False - - -def torch_ver_ge_1_13(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 1 and TORCH_MINOR >= 13: - return True - return False - - def has_coalescing_manager(): has_c10d = hasattr(torch.distributed, 'distributed_c10d') return has_c10d and hasattr(torch.distributed.distributed_c10d, '_coalescing_manager') def has_all_reduce_coalesced(): - return 
hasattr(torch.distributed, "all_reduce_coalesced") and torch_ver_ge_1_13() + return hasattr(torch.distributed, "all_reduce_coalesced") and required_torch_version(min_version=1.13) def get_coalescing_manager(group, device, reqs, async_op): - if is_torch_ver_eq_2_0(): + if required_torch_version(min_version=2.0, max_version=2.0): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, reqs=reqs) - elif is_torch_ver_ge_2_1(): + elif required_torch_version(min_version=2.1): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, async_ops=async_op) else: return torch.distributed.distributed_c10d._coalescing_manager(group, reqs) diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py index c1e8932ecaba..6051529bfa24 100644 --- a/deepspeed/elasticity/elastic_agent.py +++ b/deepspeed/elasticity/elastic_agent.py @@ -6,7 +6,7 @@ from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent from typing import Any, Dict, Optional, Tuple from datetime import datetime -from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port +from torch.distributed.elastic.agent.server.api import _get_socket_with_port from torch.distributed.elastic.metrics import put_metric from torch.distributed.elastic.agent.server.api import ( RunResult, @@ -24,6 +24,10 @@ from contextlib import closing import subprocess +from torch.distributed.elastic.utils.logging import get_logger + +log = get_logger(__name__) + class DSElasticAgent(LocalElasticAgent): diff --git a/deepspeed/elasticity/utils.py b/deepspeed/elasticity/utils.py index 848ea8646449..78ae0352cf6e 100644 --- a/deepspeed/elasticity/utils.py +++ b/deepspeed/elasticity/utils.py @@ -3,7 +3,7 @@ # DeepSpeed Team -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version def is_torch_elastic_compatible(): diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py index 912262453748..85a2f9b2b6c0 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py @@ -9,7 +9,7 @@ import subprocess import argparse from .ops.op_builder.all_ops import ALL_OPS -from .git_version_info import installed_ops, torch_info +from .git_version_info import installed_ops, torch_info, accelerator_name from deepspeed.accelerator import get_accelerator GREEN = '\033[92m' @@ -51,7 +51,8 @@ def op_report(verbose=True): for op_name, builder in ALL_OPS.items(): dots = "." * (max_dots - len(op_name)) is_compatible = OKAY if builder.is_compatible(verbose) else no - is_installed = installed if installed_ops.get(op_name, False) else no + is_installed = installed if installed_ops.get(op_name, + False) and accelerator_name == get_accelerator()._name else no dots2 = '.' 
* ((len(h[1]) + (max_dots2 - len(h[1]))) - (len(is_installed) - color_len)) print(op_name, dots, is_installed, dots2, is_compatible) print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index 635842c760ea..70c536d2f78e 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -18,5 +18,14 @@ from .ops.op_builder.all_ops import ALL_OPS installed_ops = dict.fromkeys(ALL_OPS.keys(), False) - compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) + accelerator_name = "" torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} + +# compatible_ops list is recreated for each launch +from .ops.op_builder.all_ops import ALL_OPS + +compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + compatible_ops[op_name] = op_compatible + compatible_ops["deepspeed_not_implemented"] = False diff --git a/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h b/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h index 450991b3c387..c02cc76905e0 100644 --- a/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h +++ b/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h @@ -13,7 +13,7 @@ #endif #include #include -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif #include @@ -33,7 +33,7 @@ class BlasContext { std::cerr << message << std::endl; throw std::runtime_error(message); } -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ cublasSetMathMode(_handle, CUBLAS_TENSOR_OP_MATH); #endif } @@ -55,7 +55,7 @@ class BlasContext { enum class BlasType { FP32, FP16, BF16 }; -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_operation get_trans_op(bool do_trans) { return (do_trans) ? rocblas_operation_transpose : rocblas_operation_none; @@ -99,7 +99,7 @@ int blas_gemm_ex(void* C, const float* beta, BlasType type) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_operation_t transa_op = get_trans_op(transa); rocblas_operation_t transb_op = get_trans_op(transb); @@ -155,7 +155,7 @@ int blas_gemm_ex(void* C, CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -190,7 +190,7 @@ int blas_strided_batched_gemm(void* C, int batch, BlasType type) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ rocblas_operation_t transa_op = get_trans_op(transa); rocblas_operation_t transb_op = get_trans_op(transb); @@ -257,7 +257,7 @@ int blas_strided_batched_gemm(void* C, CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { diff --git a/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp b/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp index 8a29dd2d5945..ce115f993c3c 100644 --- a/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp +++ b/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp @@ -23,7 +23,7 @@ copies. 
*/ torch::Tensor allocate_fast_host_buffer(torch::Tensor device_mirror) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ auto options = torch::TensorOptions().device(torch::kCPU).pinned_memory(true).dtype(device_mirror.dtype()); auto buffer = torch::empty(device_mirror.sizes(), options); diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 44e694952ffe..ce58deadc281 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -74,7 +74,8 @@ def name(self): def get_cmd(self, environment, active_resources): environment['PDSH_RCMD_TYPE'] = 'ssh' if self.args.ssh_port is not None: # only specify ssh port if it is specified - environment["PDSH_SSH_ARGS_APPEND"] += f" -p {self.args.ssh_port}" + environment["PDSH_SSH_ARGS_APPEND"] = f"{environment.get('PDSH_SSH_ARGS_APPEND', '')} \ + -p {self.args.ssh_port}" active_workers = ",".join(active_resources.keys()) logger.info("Running on the following workers: %s" % active_workers) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index bf9c2d74c635..88f7086518e8 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -133,7 +133,7 @@ def is_load_module(module): load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm] load_layer_names = [ "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "FalconLinear", - "MistralRMSNorm", "T5LayerNorm" + "MistralRMSNorm", "T5LayerNorm", "MixtralRMSNorm" ] return module.__class__ in load_layers or module._get_name() in load_layer_names @@ -303,6 +303,9 @@ def tp_parser(model): elif 'self_attention.dense' in layer and 'falcon' in str( type(module)): # this is a hack to get the right linear layer for this model! gem_list = gem_list + [layer] + # Mixtral-7x8b used w2*act(w1*w3) linear. need to replace w2 to linearallreduce. + elif 'w2' in layer and 'Mixtral' in str(type(module)): + gem_list = gem_list + [layer] layer_list = [] if gem_list != []: @@ -322,6 +325,9 @@ def _replace(self, child, name, conv_linear_layer): return weight_shape = child.weight.shape mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group) + # For mixtral-7x8b, need to skip MoE gate linear replace. + if name == "block_sparse_moe.gate": + return child if name in self.all_reduce_linears: # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size] # else [weight_shape[0], weight_shape[1] // mp_size] diff --git a/deepspeed/module_inject/containers/bloom.py b/deepspeed/module_inject/containers/bloom.py index 05f30eec8d85..a78ac8120346 100644 --- a/deepspeed/module_inject/containers/bloom.py +++ b/deepspeed/module_inject/containers/bloom.py @@ -23,12 +23,14 @@ def __init__(self, **kwargs): # All model specific things should be defined here instead of the base class. 
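# BLOOM builds its own ALiBi-based attention mask, so triangular masking is disabled here and,
# in create_module below, the incoming mask is used without being inverted.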
self.bigscience_bloom = True + self.triangular_masking = False def create_module(self, config=None): _config = config if config is not None else self.ds_model_config self.module = DeepSpeedBloomInference(_config, mp_group=self.mp_group) self.module.config.scale_attention = self.scale_attention + self.module.config.invert_mask = False return self.module def attention_qkv_mp(self, mp_replace, reversed_dim=False): diff --git a/deepspeed/moe/layer.py b/deepspeed/moe/layer.py index dfa9fcf4f464..6777788ab885 100644 --- a/deepspeed/moe/layer.py +++ b/deepspeed/moe/layer.py @@ -71,7 +71,7 @@ def __init__(self, experts = Experts(expert, self.num_local_experts, self.expert_group_name) self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor, - min_capacity, noisy_gate_policy, drop_tokens, use_rts, + min_capacity, noisy_gate_policy, drop_tokens, use_rts, None, top2_2nd_expert_sampling), experts, self.expert_group_name, diff --git a/deepspeed/moe/mappings.py b/deepspeed/moe/mappings.py index 6c501ea6503a..b8a06274343a 100644 --- a/deepspeed/moe/mappings.py +++ b/deepspeed/moe/mappings.py @@ -23,6 +23,8 @@ import torch import deepspeed +from deepspeed.utils.bwc import (bwc_tensor_model_parallel_world_size, bwc_tensor_model_parallel_rank, + bwc_tensor_model_parallel_group) def _gather_tokens(input_, dim=0): @@ -31,11 +33,11 @@ def _gather_tokens(input_, dim=0): input_ = input_.contiguous() # Size and dimension. - rank = mpu.get_tensor_model_parallel_rank() + rank = bwc_tensor_model_parallel_rank(mpu) - tensor_list = [torch.empty_like(input_) for _ in range(mpu.get_tensor_model_parallel_world_size())] + tensor_list = [torch.empty_like(input_) for _ in range(bwc_tensor_model_parallel_world_size(mpu))] tensor_list[rank] = input_ - deepspeed.comm.all_gather(tensor_list, input_, group=mpu.get_tensor_model_parallel_group()) + deepspeed.comm.all_gather(tensor_list, input_, group=bwc_tensor_model_parallel_group(mpu)) # Note: torch.cat already creates a contiguous tensor. 
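# Sketch of the backward-compatibility helper now imported from deepspeed.utils.bwc and used in
# place of direct mpu calls below. It mirrors the bwc_tensor_model_parallel_rank implementation
# removed from runtime/utils.py later in this patch; the exact bwc module contents are assumed.
def bwc_tensor_model_parallel_world_size_sketch(mpu=None):
    if mpu is None:
        return 1  # no tensor model parallelism configured
    if hasattr(mpu, 'get_tensor_model_parallel_world_size'):
        return mpu.get_tensor_model_parallel_world_size()  # current Megatron/DeepSpeed API
    if hasattr(mpu, 'get_slice_parallel_world_size'):
        return mpu.get_slice_parallel_world_size()         # older DeepSpeed pipeline variants
    return mpu.get_model_parallel_world_size()              # deprecated convention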
output = torch.cat(tensor_list, dim=dim).contiguous() @@ -47,8 +49,8 @@ def _drop_tokens(input_, dim=0): """Divide a tensor among the tensor parallel ranks""" mpu = deepspeed.utils.groups.mpu - total_chunks = mpu.get_tensor_model_parallel_world_size() - this_chunk = mpu.get_tensor_model_parallel_rank() + total_chunks = bwc_tensor_model_parallel_world_size(mpu) + this_chunk = bwc_tensor_model_parallel_rank(mpu) assert input_.shape[ dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})" chunk_size = input_.shape[dim] // total_chunks @@ -92,7 +94,7 @@ def backward(ctx, input_): def gather_tokens(input_, dim=0): mpu = deepspeed.utils.groups.mpu - if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: + if mpu is None or bwc_tensor_model_parallel_world_size(mpu) == 1: # no tensor parallelism for non-experts return input_ return _GatherTokens.apply(input_, dim) @@ -100,7 +102,7 @@ def gather_tokens(input_, dim=0): def drop_tokens(input_, dim=0): mpu = deepspeed.utils.groups.mpu - if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: + if mpu is None or bwc_tensor_model_parallel_world_size(mpu) == 1: # no tensor parallelism for non-experts return input_ return _DropTokens.apply(input_, dim) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index d6c023ec11d3..bd2782279c01 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -17,7 +17,8 @@ from deepspeed.utils.timer import SynchronizedWallClockTimer from deepspeed.utils import logger -from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple +from deepspeed.utils.bwc import bwc_tensor_model_parallel_world_size +from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple, Union import torch from torch import Tensor @@ -184,6 +185,7 @@ def top1gating(logits: Tensor, noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, use_rts: bool = True, + ep_group: Union[torch.distributed.ProcessGroup, None] = None, use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top1Gating on logits.""" if noisy_gate_policy == 'RSample': @@ -209,13 +211,16 @@ def top1gating(logits: Tensor, # if we don't want to drop any tokens if not drop_tokens: new_capacity = torch.max(exp_counts).to(logits.device) - dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) + # Communicate across expert processes to pick the maximum capacity. + if ep_group is not None: + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=ep_group) if groups._get_expert_model_parallel_world_size() == 1: # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'. # This is since we are going to activate drop_tokens() to drop duplicate tokens. - tp = 1 if groups.mpu is None else groups.mpu.get_tensor_model_parallel_world_size() + tp = 1 if groups.mpu is None else bwc_tensor_model_parallel_world_size(mpu=groups.mpu) new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype) - capacity = new_capacity + # Make sure the capacity value does not exceed the number of tokens. 
+ capacity = min(new_capacity, torch.tensor(mask1.size(0))) # Compute l_aux me = torch.mean(gates, dim=0) @@ -284,6 +289,7 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int, drop_tokens: bool = True, + ep_group: Union[torch.distributed.ProcessGroup, None] = None, top2_2nd_expert_sampling: bool = True) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function @@ -326,11 +332,12 @@ def top2gating(logits: Tensor, else: # Do not drop tokens - set capacity according to current expert assignments new_capacity = torch.max(exp_counts) - dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) + if ep_group is not None: + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=ep_group) if groups._get_expert_model_parallel_world_size() == 1: # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'. # This is since we are going to activate drop_tokens() to drop duplicate tokens. - tp = 1 if groups.mpu is None else groups.mpu.get_tensor_model_parallel_world_size() + tp = 1 if groups.mpu is None else bwc_tensor_model_parallel_world_size(mpu=groups.mpu) new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype) capacity = new_capacity @@ -374,7 +381,7 @@ class TopKGate(Module): Args: model_dim (int): size of model embedding dimension - num_experts (ints): + num_experts (int): number of experts in model """ @@ -390,6 +397,7 @@ def __init__(self, noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, use_rts: bool = True, + ep_group: Union[torch.distributed.ProcessGroup, None] = None, top2_2nd_expert_sampling: bool = True) -> None: super().__init__() @@ -397,6 +405,7 @@ def __init__(self, if k != 1 and k != 2: raise ValueError('Only top-1 and top-2 gatings are supported.') self.wg = torch.nn.Linear(model_dim, num_experts, bias=False) + self.ep_group = ep_group self.k = k self.capacity_factor = capacity_factor self.eval_capacity_factor = eval_capacity_factor @@ -409,6 +418,10 @@ def __init__(self, self.use_rts = use_rts self.top2_2nd_expert_sampling = top2_2nd_expert_sampling + def _set_ep_group(self, ep_group): + assert self.ep_group is None, f'Attempting to override an existing ep_group' + self.ep_group = ep_group + def forward(self, input: torch.Tensor, used_token: torch.Tensor = None, @@ -426,11 +439,11 @@ def forward(self, if self.k == 1: gate_output = top1gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, self.min_capacity, used_token, self.noisy_gate_policy if self.training else None, - self.drop_tokens, self.use_rts, use_tutel) + self.drop_tokens, self.use_rts, self.ep_group, use_tutel) else: gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity, self.drop_tokens, self.top2_2nd_expert_sampling) + self.min_capacity, self.drop_tokens, self.ep_group, self.top2_2nd_expert_sampling) if self.wall_clock_breakdown: self.timers(TOPK_GATE_TIMER).stop() @@ -490,6 +503,7 @@ def __init__(self, def _set_ep_group(self, ep_group): self.ep_group = ep_group + self.gate._set_ep_group(ep_group) def forward(self, *input: Tensor, **kwargs: Any) -> Tensor: diff --git a/deepspeed/moe/utils.py b/deepspeed/moe/utils.py index 8e1faffc3541..20866378efac 100644 --- a/deepspeed/moe/utils.py +++ b/deepspeed/moe/utils.py @@ -146,3 +146,37 @@ def split_params_into_different_moe_groups_for_optimizer( param_groups.append(param_group) return param_groups + + +def 
is_moe_param_group(param_group): + return param_group.get('moe', False) + + +def configure_moe_param_groups(model_parameters: List): + assert isinstance(model_parameters, list), "model_parameters must be a list" + + for p in model_parameters: + # match torch.optim.Optimizer expectations, + # see: https://github.com/pytorch/pytorch/blob/2ffab6e663b9c6951048b8c8ba82d2cc5ca5c2fc/torch/optim/optimizer.py#L270-L272 + if not isinstance(p, (torch.Tensor, dict)): + raise TypeError("param argument that would be given to the optimizer should be " + f"an iterable of Tensors or dicts, but got {type(p)}") + + # peak at the first element to determine how to proceed + first = model_parameters[0] + + # Case 1: model_parameters is a list of torch.nn.Parameter + # -> need to create moe compatible param groups + if isinstance(first, torch.nn.Parameter): + param_group = {'params': model_parameters, 'name': 'dense-params'} + return split_params_into_different_moe_groups_for_optimizer(param_group) + + # Case 2: model_parameters is a list of param groups List[dict] + # -> moe compatible param groups might already exist, if not create them + elif isinstance(first, dict): + #there are no moe groups created + if not any(['moe' in param_group for param_group in model_parameters]): + return split_params_into_different_moe_groups_for_optimizer(model_parameters) + else: + # moe groups exist, nothing to do + return model_parameters diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py index ba1c9c1fd9f0..7ea5ce5af19e 100755 --- a/deepspeed/ops/__init__.py +++ b/deepspeed/ops/__init__.py @@ -7,8 +7,6 @@ from . import adagrad from . import lamb from . import lion -#from ..git_version_info_installed import installed_ops as __installed_ops__ -#if __installed_ops__['sparse_attn']: from . import sparse_attention from . import transformer diff --git a/deepspeed/ops/fp_quantizer/__init__.py b/deepspeed/ops/fp_quantizer/__init__.py new file mode 100644 index 000000000000..5575f3567185 --- /dev/null +++ b/deepspeed/ops/fp_quantizer/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .quantize import FP_Quantize diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py new file mode 100644 index 000000000000..5dc3c190ae5d --- /dev/null +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -0,0 +1,79 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +fp_quant_module = None + + +class FP_Quantize: + + def __init__(self, group_size=512) -> None: + global fp_quant_module + if fp_quant_module is None: + fp_quant_module = FPQuantizerBuilder().load() + + self.group_size = group_size + self.orig_dtype = None + + def quantize(self, + input, + q_bits=8, + q_mantisa_bits=3, + stochastic_mode=False, + return_meta_tensor=False) -> torch.Tensor: + assert input.dtype == torch.bfloat16, "only support bf16 for now" + if return_meta_tensor: + assert q_bits == 8, "meta tensor is only supported with q_bit=8" + + self.orig_dtype = input.dtype + self.orig_shape = input.shape + + if q_bits == 8: + pass + elif q_bits == 12: + q_mantisa_bits = 4 + elif q_bits == 6: + q_mantisa_bits = 2 + elif q_bits == 4: + q_mantisa_bits = 1 + else: + assert (0), \ + f"Missing {q_bits}-quantization, please add the template arguments for the kernel to support this precision!" 
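# Aside (sketch): the floating-point layouts implied by the q_bits branches above, assuming one
# sign bit and exponent_bits = q_bits - mantissa_bits - 1, which is the split passed to the
# kernel in dequantize() below.
FP_LAYOUTS = {
    12: (4, 7),  # q_bits: (mantissa_bits, exponent_bits)
    8: (3, 4),
    6: (2, 3),
    4: (1, 2),
}
assert all(1 + m + e == bits for bits, (m, e) in FP_LAYOUTS.items())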
+ + out = fp_quant_module.quantize(input, self.group_size, stochastic_mode, q_bits, q_mantisa_bits) + + if return_meta_tensor: + data, scale = out.split(self.group_size, dim=-1) + return data.contiguous().reshape(input.shape), scale.contiguous() + + return out + + def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor: + assert (self.orig_dtype is not None), \ + "[De-quantization Error]: you need to call quantize before dequantizing!" + fp_out = torch.empty(self.orig_shape, dtype=self.orig_dtype, + device=input_q.device) if fp_out is None else fp_out + if q_bits == 8: + pass + elif q_bits == 12: + q_mantisa_bits = 4 + elif q_bits == 6: + q_mantisa_bits = 2 + elif q_bits == 4: + q_mantisa_bits = 1 + else: + assert (0), \ + f"Missing {q_bits}-dequantization, please add the template arguments for the kernel to support this precision!" + + if scale is not None: + assert input_q.numel() == fp_out.numel(), \ + f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' + input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous() + + fp_quant_module.dequantize(fp_out, input_q, self.group_size, q_mantisa_bits, q_bits - q_mantisa_bits - 1) + return fp_out diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py index d5aff4f541f7..9709328cc133 100644 --- a/deepspeed/ops/transformer/inference/config.py +++ b/deepspeed/ops/transformer/inference/config.py @@ -43,6 +43,7 @@ class DeepSpeedInferenceConfig(TransformerConfig): return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor bigscience_bloom: This flag is added temporarily for supporting the BLOOM-176B model architecture. use_triton: This flag is to enable triton kernels in inference or not. + invert_mask: If True, the attention mask is inverted when passed to attention block. 
""" def __init__(self, @@ -80,7 +81,8 @@ def __init__(self, use_triton=False, triton_autotune=False, num_kv=-1, - rope_theta=10000): + rope_theta=10000, + invert_mask=True): super(DeepSpeedInferenceConfig, self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, num_hidden_layers) @@ -116,6 +118,7 @@ def __init__(self, self.triton_autotune = triton_autotune self.num_kv = num_kv self.rope_theta = rope_theta + self.invert_mask = invert_mask @classmethod def from_dict(cls, json_object): diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index eb6ce2f75c69..56cf3c7b6a2c 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -254,8 +254,12 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): if input_mask.dtype == torch.bool: input_mask = input_mask.long() + # Invert input_mask per transformer implementation (eg, in BLOOM, it's already inverted) + if self.config.invert_mask: + input_mask = 1 - input_mask + attention_probs = self.softmax_func(attn_scores=attention_scores, - attn_mask=((1 - input_mask).to(target_dtype) * minus_inf), + attn_mask=input_mask.to(target_dtype) * minus_inf, alibi=alibi, triangular=(self.config.triangular_masking and (attention_scores.shape[-2] > 1)), diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py index 90bfcae81bf2..8766b65e866d 100644 --- a/deepspeed/ops/transformer/inference/moe_inference.py +++ b/deepspeed/ops/transformer/inference/moe_inference.py @@ -226,7 +226,7 @@ def __init__(self, self.moe_gate = TopKGate(self.config.hidden_size, self.config.global_experts, self.config.k, self.config.capacity_factor, self.config.eval_capacity_factor, self.config.min_capacity, self.config.noisy_gate_policy, self.config.drop_tokens, - self.config.use_rts) + self.config.use_rts, self.ep_group) self.ep_group = ep_group self.mp_group = mp_group diff --git a/deepspeed/ops/transformer/inference/triton/matmul_ext.py b/deepspeed/ops/transformer/inference/triton/matmul_ext.py index d6f72b4efb0b..c77d8a8e11c0 100644 --- a/deepspeed/ops/transformer/inference/triton/matmul_ext.py +++ b/deepspeed/ops/transformer/inference/triton/matmul_ext.py @@ -13,12 +13,41 @@ import deepspeed from pathlib import Path import atexit +import subprocess # ----------------------------------------------------------------------------- # util class/functions for triton -def _default_cache_dir(): - return os.path.join(Path.home(), ".triton", "autotune") +def is_nfs_path(path): + # Normalize the path to get the absolute path + path = os.path.abspath(path) + + # Use the 'df' command to find the file system type for the given path + try: + output = subprocess.check_output(['df', '-T', path], encoding='utf-8') + except subprocess.CalledProcessError: + return False # Command failed + + # Process the output of 'df -T' to check for 'nfs' in the filesystem type column + lines = output.strip().split('\n') + if len(lines) > 1: # The first line is headers + fs_type = lines[1].split()[1].lower() # File system type is the second column + return 'nfs' in fs_type + return False + + +class TritonCacheDir: + _warning_printed = False + + @staticmethod + def default_cache_dir(): + tmp_path = os.path.join(Path.home(), ".triton", "autotune") + if is_nfs_path(tmp_path) and not TritonCacheDir._warning_printed: + print( + f"Warning: The default cache directory for 
DeepSpeed Triton autotune, {tmp_path}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path." + ) + TritonCacheDir._warning_printed = True + return tmp_path def bias_add_activation(C, bias=None, activation=""): @@ -50,7 +79,7 @@ def __init__(self, key): self.file_path = None self.lock_path = None # if caching is enabled, get the lock and bin path - self.cache_dir = os.environ.get('TRITON_CACHE_DIR', _default_cache_dir()) + self.cache_dir = os.environ.get('TRITON_CACHE_DIR', TritonCacheDir.default_cache_dir()) if self.cache_dir: os.makedirs(self.cache_dir, exist_ok=True) if self.cache_dir: diff --git a/deepspeed/runtime/__init__.py b/deepspeed/runtime/__init__.py index 347ff7993d82..208299fb8c50 100644 --- a/deepspeed/runtime/__init__.py +++ b/deepspeed/runtime/__init__.py @@ -2,11 +2,3 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team - - -class DeepSpeedOptimizer(object): - pass - - -class ZeROOptimizer(DeepSpeedOptimizer): - pass diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 02e0b197e927..2a21cf7ca17a 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -25,8 +25,9 @@ from deepspeed.runtime.config import DeepSpeedConfig from deepspeed.utils import logger -from deepspeed.runtime.utils import copy_to_device, move_to_device, see_memory_usage, bwc_tensor_model_parallel_rank +from deepspeed.runtime.utils import copy_to_device, move_to_device, see_memory_usage from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers, FORWARD_GLOBAL_TIMER +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank from deepspeed.accelerator import get_accelerator # DeepSpeed Checkpointing Enabled or Disabled diff --git a/deepspeed/runtime/base_optimizer.py b/deepspeed/runtime/base_optimizer.py new file mode 100644 index 000000000000..6cfd66f1cc38 --- /dev/null +++ b/deepspeed/runtime/base_optimizer.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +import torch + +from deepspeed.utils import logger +from deepspeed.utils.tensor_fragment import map_to_flat_opt_states +from deepspeed.runtime.utils import bwc_tensor_model_parallel_rank + + +class DeepSpeedOptimizer(object): + pass + + +class ZeROOptimizer(DeepSpeedOptimizer): + + def load_hp_checkpoint_state_from_checkpoint_dir(self, lp_groups_name: str, checkpoint_dir: str) -> None: + checkpoint_dir = os.path.join(checkpoint_dir, "zero") + optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt") + assert os.path.isfile( + optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.' 
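# Aside (sketch): the on-disk layout this universal-checkpoint loader expects; the parameter
# folder name below is a hypothetical example.
#   <checkpoint_dir>/zero/optimizer_state.pt  -> global optimizer state, torch.load'ed next
#   <checkpoint_dir>/zero/<param_name>/       -> per-parameter hp state consumed by
#                                                lp.load_hp_checkpoint_state(), e.g.
#                                                .../zero/module.layers.0.weight/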
+ optim_sd = torch.load(optim_state_path) + + self._load_global_state(optim_sd) + + tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) + if self.mpu is None: + logger.warn("MPU is not provided, setting tp size to 1 in checkpoint loading.") + tp_world_size = 1 + else: + tp_world_size = self.mpu.get_slice_parallel_world_size() if hasattr(self.mpu, "get_slice_parallel_world_size") \ + else self.mpu.get_tensor_model_parallel_world_size() + + for i, (param_group, + loaded_param_group) in enumerate(zip(self.optimizer.param_groups, optim_sd['param_groups'])): + # We have an assumption that all params in the same param_group have the same keys + opt_keys = set() + steps = [] + + lp_groups = getattr(self, lp_groups_name) + for lp in lp_groups[i]: + if lp._hp_mapping is not None: + #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") + step = lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, + tp_world_size) + for key in lp._hp_mapping.get_optim_state_keys(): + opt_keys.add(key) + steps.append(step) + + hp_param = param_group['params'][0] + assert all(step == steps[0] for step in steps), f"Steps {steps} are not equal" + if steps[0] is not None: + self.optimizer.state[hp_param]['step'] = steps[0] + + map_to_flat_opt_states(hp_param, lp_groups[i], self.optimizer.state, opt_keys) + + for key, value in loaded_param_group.items(): + if key == 'params': + continue + param_group[key] = value diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index aaa836bf1c31..f970e582b354 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -6,19 +6,18 @@ from collections import OrderedDict import torch import sys -import os from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed import comm as dist from deepspeed.runtime.constants import PIPE_REPLICATED -from deepspeed.runtime import ZeROOptimizer +from deepspeed.runtime.base_optimizer import ZeROOptimizer from packaging import version as pkg_version - from deepspeed.git_version_info import version from deepspeed.runtime.utils import (get_global_norm_of_tensors, clip_tensors_by_global_norm, DummyOptim, - align_dense_tensors, all_gather_dp_groups, bwc_tensor_model_parallel_rank, - is_model_parallel_parameter, see_memory_usage, graph_process) - -from deepspeed.utils import link_hp_params, lazy_init_hp_params_optimizer_state, fragment_address + align_dense_tensors, all_gather_dp_groups, is_model_parallel_parameter, + see_memory_usage, graph_process, get_norm_with_moe_layers) +from deepspeed.utils import link_hp_params, lazy_init_hp_params_optimizer_state, fragment_address, groups +from deepspeed.moe.utils import is_moe_param, is_moe_param_group +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank from deepspeed.checkpoint import enable_universal_checkpoint from deepspeed.checkpoint.constants import (DS_VERSION, PARTITION_COUNT, BASE_OPTIMIZER_STATE, SINGLE_PARTITION_OF_FP32_GROUPS, CLIP_GRAD, GROUP_PADDINGS, @@ -40,7 +39,8 @@ def __init__(self, timers=None, grad_acc_dtype=None, graph_harvesting=False, - immediate_grad_update=False): + immediate_grad_update=False, + has_moe_layers=False): super().__init__() see_memory_usage('begin bf16_optimizer', force=True) self.timers = timers @@ -59,7 +59,11 @@ def __init__(self, self.allgather_bucket_size = int(allgather_bucket_size) self.dp_process_group = dp_process_group self.dp_rank = dist.get_rank(group=self.dp_process_group) + self.has_moe_layers = 
has_moe_layers + self.non_expert_gradients = [] self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))] + if self.has_moe_layers: + self._configure_moe_settings() # Use torch (un)flatten ops self.flatten = _flatten_dense_tensors @@ -90,11 +94,26 @@ def __init__(self, see_memory_usage('end bf16_optimizer', force=True) + def _configure_moe_settings(self): + assert any( + [is_moe_param_group(group) for group in self.optimizer.param_groups] + ), "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer" + + for i, group in enumerate(self.optimizer.param_groups): + if is_moe_param_group(group): + assert all([is_moe_param(param) + for param in group['params']]), "All params in MoE group must be MoE params" + self.real_dp_process_group[i] = groups._get_expert_data_parallel_group(group['name']) + self.expert_gradients = {} + if self.has_moe_layers: + for key in groups._get_expert_data_parallel_group_dict().keys(): + self.expert_gradients[key] = [] + def _setup_for_real_optimizer(self): - dp_world_size = dist.get_world_size(group=self.dp_process_group) - self.partition_count = [dp_world_size for i in range(len(self.optimizer.param_groups))] + self.partition_count = [dist.get_world_size(group=pg) for pg in self.real_dp_process_group] for i, param_group in enumerate(self.optimizer.param_groups): + real_dp_world_size = dist.get_world_size(group=self.real_dp_process_group[i]) see_memory_usage(f'before initializing group {i}', force=True) partition_id = dist.get_rank(group=self.real_dp_process_group[i]) @@ -106,17 +125,16 @@ def _setup_for_real_optimizer(self): # create flat bf16 params self.bf16_groups_flat.append( self._flatten_dense_tensors_aligned(self.bf16_groups[i], - self.nccl_start_alignment_factor * dp_world_size)) - + self.nccl_start_alignment_factor * real_dp_world_size)) # Make bf16 params point to flat tensor storage self._update_storage_to_flattened_tensor(tensor_list=self.bf16_groups[i], flat_tensor=self.bf16_groups_flat[i]) # divide flat weights into equal sized partitions - partition_size = self.bf16_groups_flat[i].numel() // dp_world_size + partition_size = self.bf16_groups_flat[i].numel() // real_dp_world_size bf16_dp_partitions = [ self.bf16_groups_flat[i].narrow(0, dp_index * partition_size, partition_size) - for dp_index in range(dp_world_size) + for dp_index in range(real_dp_world_size) ] self.bf16_partitioned_groups.append(bf16_dp_partitions) @@ -127,8 +145,12 @@ def _setup_for_real_optimizer(self): num_elem_list = [t.numel() for t in self.bf16_groups[i]] # create fp32 gradients - self.fp32_groups_gradients_flat.append( - torch.zeros_like(self.bf16_groups_flat[i], dtype=self.grad_acc_dtype)) + fp32_flat_buffer = torch.zeros_like(self.bf16_groups_flat[i], dtype=self.grad_acc_dtype) + self.fp32_groups_gradients_flat.append(fp32_flat_buffer) + if self.has_moe_layers and is_moe_param_group(param_group): + self.expert_gradients[param_group['name']].append(fp32_flat_buffer) + else: + self.non_expert_gradients.append(fp32_flat_buffer) # track individual fp32 gradients for entire model fp32_gradients = self._split_flat_tensor(flat_tensor=self.fp32_groups_gradients_flat[i], @@ -191,11 +213,12 @@ def _create_param_mapping(self): return param_mapping def _link_all_hp_params(self): - dp_world_size = dist.get_world_size(group=self.dp_process_group) for i, _ in enumerate(self.optimizer.param_groups): + real_dp_world_size = 
dist.get_world_size(group=self.real_dp_process_group[i]) + # Link bf16 and fp32 params in partition partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - partition_size = self.bf16_groups_flat[i].numel() // dp_world_size + partition_size = self.bf16_groups_flat[i].numel() // real_dp_world_size flat_hp_partition = self.fp32_groups_flat_partition[i] link_hp_params(lp_param_list=self.bf16_groups[i], flat_hp_partition=flat_hp_partition, @@ -257,10 +280,18 @@ def step(self, closure=None): if closure is not None: raise NotImplementedError(f'{self.__class__} does not support closure.') - all_groups_norm = get_global_norm_of_tensors(input_tensors=self.get_grads_for_norm(), - mpu=self.mpu, - norm_type=self.norm_type, - use_graph=self.graph_harvesting) + non_expert_grads_for_norm, expert_grads_for_norm = self.get_grads_for_norm() + non_expert_groups_norm = get_global_norm_of_tensors(input_tensors=non_expert_grads_for_norm, + mpu=self.mpu, + norm_type=self.norm_type, + use_graph=self.graph_harvesting) + all_groups_norm = non_expert_groups_norm + if self.has_moe_layers: + all_groups_norm = get_norm_with_moe_layers(non_expert_groups_norm, + mpu=self.mpu, + expert_tensors=expert_grads_for_norm, + norm_type=self.norm_type) + self._global_grad_norm = all_groups_norm assert all_groups_norm > 0. @@ -336,27 +367,55 @@ def update_hp_grads(self, clear_lp_grads=False): @torch.no_grad() def get_grads_for_reduction(self): - return self.fp32_groups_gradients_flat + if self.has_moe_layers: + return self.non_expert_gradients, self.expert_gradients + return self.non_expert_gradients, {} @torch.no_grad() def get_grads_for_norm(self, for_clipping=False): - grads = [] + """ + Returns: + tuple[list[Tensor], dict[ep_name, List[Tensor]] | list: + If for_clipping, return all gradients. + Otherwise, separate and return dict of expert_grad and list of non_expert_grad + """ + # (grads, expert_group_name) + expert_grads_for_norm = {} + + # grads + non_expert_grads_for_norm = [] + all_grads_for_clip = [] + tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) + assert len(self.bf16_groups) == len(self.optimizer.param_groups) for i, group in enumerate(self.bf16_groups): for j, lp in enumerate(group): if not for_clipping: if hasattr(lp, PIPE_REPLICATED) and lp.ds_pipe_replicated: continue - if not (tensor_mp_rank == 0 or is_model_parallel_parameter(lp)): + # skip duplicated parameters. perform norm only on cards with tp_rank=0. + # non-duplicated parameters include: + # - Parameters with tp: Use allreducesum of mp_group. + # - Moe Parameters with ep: Use allreducesum of ep_group. 
+ if not (tensor_mp_rank == 0 or is_model_parallel_parameter(lp) or is_moe_param(lp)): continue if not self.fp32_groups_has_gradients[i][j]: continue - - grads.append(self.fp32_groups_gradients[i][j]) - - return grads + if not for_clipping: + param_group = self.optimizer.param_groups[i] + if self.has_moe_layers and is_moe_param_group(param_group): + if param_group['name'] not in expert_grads_for_norm: + expert_grads_for_norm[param_group['name']] = [] + expert_grads_for_norm[param_group['name']].append(self.fp32_groups_gradients[i][j]) + else: + non_expert_grads_for_norm.append(self.fp32_groups_gradients[i][j]) + else: + all_grads_for_clip.append(self.fp32_groups_gradients[i][j]) + if not for_clipping: + return non_expert_grads_for_norm, expert_grads_for_norm + return all_grads_for_clip @torch.no_grad() def update_lp_params(self): @@ -433,6 +492,7 @@ def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, l self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad) if load_optimizer_states: + print(f"_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]") self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE]) if load_from_fp32_weights: @@ -445,28 +505,19 @@ def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, l self._link_all_hp_params() def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): - self._load_hp_checkpoint_state(checkpoint_folder) + self.load_hp_checkpoint_state_from_checkpoint_dir("bf16_groups", checkpoint_folder) + + def _load_global_state(self, sd): + pass @property def param_groups(self): """Forward the wrapped optimizer's parameters.""" return self.optimizer.param_groups - def _load_hp_checkpoint_state(self, checkpoint_dir): - checkpoint_dir = os.path.join(checkpoint_dir, "zero") - tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) - tp_world_size = self.mpu.get_slice_parallel_world_size() - - for i, _ in enumerate(self.optimizer.param_groups): - for lp in self.bf16_groups[i]: - if lp._hp_mapping is not None: - #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, - tp_world_size) - def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx): assert self.immediate_grad_update - self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=False) + self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=True) def create_grad_acc_hooks(self): self.grad_accs = [] diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 231f841ae8b7..a57b7519a295 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -9,7 +9,7 @@ import numpy as np from deepspeed.runtime.compression.cupy import CupyBackend -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 975fb1f21501..19b169086be1 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -258,10 +258,10 @@ def get_communication_data_type(param_dict, return torch.float32 elif val == "fp16": return torch.float16 - elif val == "bfp16": + elif val == "bf16": return torch.bfloat16 - raise ValueError(f"Invalid communication_data_type. Supported data types: ['fp16', 'bfp16', 'fp32']. 
Got: {val}") + raise ValueError(f"Invalid communication_data_type. Supported data types: ['fp16', 'bf16', 'fp32']. Got: {val}") def get_prescale_gradients(param_dict): diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 5c1202ba06ae..992d7877c179 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -94,7 +94,7 @@ from ..ops.adam import FusedAdam from ..moe.sharded_moe import TopKGate, MOELayer from ..moe.layer import MoE -from ..moe.utils import is_moe_param +from ..moe.utils import is_moe_param, configure_moe_param_groups from ..git_version_info import version from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler @@ -1227,6 +1227,8 @@ def _do_optimizer_sanity_check(self, basic_optimizer): # Configure optimizer def _configure_optimizer(self, client_optimizer, model_parameters): if client_optimizer is None: + if self.has_moe_layers: + model_parameters = configure_moe_param_groups(model_parameters) basic_optimizer = self._configure_basic_optimizer(model_parameters) log_dist(f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", ranks=[0]) else: @@ -1478,7 +1480,8 @@ def _configure_bf16_optimizer(self, optimizer): timers=timers, grad_acc_dtype=self.get_data_types()[1], graph_harvesting=self.graph_harvesting(), - immediate_grad_update=self._config.bfloat16_immediate_grad_update) + immediate_grad_update=self._config.bfloat16_immediate_grad_update, + has_moe_layers=self.has_moe_layers) return optimizer @@ -1924,9 +1927,6 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): self.optimizer.reduce_gradients(pipeline_parallel=self.pipeline_parallelism) else: grads = None - if hasattr(self.optimizer, "get_grads_for_reduction"): - # This is currently for BF16 optimizer - grads = self.optimizer.get_grads_for_reduction() self.buffered_allreduce_fallback(grads=grads, elements_per_buffer=bucket_size) @instrument_w_nvtx @@ -2335,7 +2335,7 @@ def _report_progress(self, step): mom = self.get_mom() log_dist(f"step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}", ranks=[0]) - def allreduce_bucket(self, bucket, dp_group): + def allreduce_bucket(self, bucket, dp_group, dp_world_size=None): tensor = self.flatten(bucket) tensor_to_allreduce = tensor @@ -2343,16 +2343,18 @@ def allreduce_bucket(self, bucket, dp_group): if self.communication_data_type != tensor.dtype: tensor_to_allreduce = tensor.to(self.communication_data_type) + if dp_world_size is None: + dp_world_size = dist.get_world_size(group=dp_group) if self.postscale_gradients(): if self.gradient_predivide_factor() != 1.0: tensor_to_allreduce.mul_(1.0 / self.gradient_predivide_factor()) dist.all_reduce(tensor_to_allreduce, group=dp_group) if self.gradient_average: - if self.gradient_predivide_factor() != dist.get_world_size(group=dp_group): - tensor_to_allreduce.mul_(self.gradient_predivide_factor() / dist.get_world_size(group=dp_group)) + if self.gradient_predivide_factor() != dp_world_size: + tensor_to_allreduce.mul_(self.gradient_predivide_factor() / dp_world_size) else: - tensor_to_allreduce.mul_(1. / dist.get_world_size(group=dp_group)) + tensor_to_allreduce.mul_(1. 
/ dp_world_size) dist.all_reduce(tensor_to_allreduce, group=dp_group) if self.communication_data_type != tensor.dtype and tensor is not tensor_to_allreduce: @@ -2360,23 +2362,23 @@ def allreduce_bucket(self, bucket, dp_group): return tensor - def allreduce_and_copy(self, small_bucket, dp_group): - allreduced = self.allreduce_bucket(small_bucket, dp_group) + def allreduce_and_copy(self, small_bucket, dp_group, dp_world_size=None): + allreduced = self.allreduce_bucket(small_bucket, dp_group, dp_world_size) for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) - def allreduce_no_retain(self, bucket, dp_group, numel_per_bucket=500000000): + def allreduce_no_retain(self, bucket, dp_group, numel_per_bucket=500000000, dp_world_size=None): small_bucket = [] numel = 0 for tensor in bucket: small_bucket.append(tensor) numel = numel + tensor.numel() if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket, dp_group) + self.allreduce_and_copy(small_bucket, dp_group, dp_world_size) small_bucket = [] numel = 0 if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket, dp_group) + self.allreduce_and_copy(small_bucket, dp_group, dp_world_size) def _get_gradients_for_reduction(self): non_expert_grads = [] @@ -2427,26 +2429,35 @@ def _reduce_non_expert_gradients(self, grads, elements_per_buffer): self.allreduce_no_retain(dense_bucket, dp_group=dp_group, numel_per_bucket=elements_per_buffer) def _reduce_expert_gradients(self, expert_grads, elements_per_buffer): + # to maintain the gradients value unaffected by ep_size setting, + # utilize dp_world_size for allreduce average + dp_world_size = dist.get_world_size(groups._get_data_parallel_group()) for ep_name, expert_grads_group in expert_grads.items(): + ep_dp_group = groups._get_expert_data_parallel_group(ep_name) split_sparse_tensor_buckets, split_dense_tensor_buckets = split_half_float_double_sparse( expert_grads_group) for _, sparse_bucket_tuple in enumerate(split_sparse_tensor_buckets): if sparse_bucket_tuple: bucket_type, sparse_bucket = sparse_bucket_tuple - self.sparse_allreduce_no_retain(sparse_bucket, groups._get_expert_data_parallel_group(ep_name)) + self.sparse_allreduce_no_retain(sparse_bucket, dp_group=ep_dp_group, dp_world_size=dp_world_size) for _, dense_bucket_tuple in enumerate(split_dense_tensor_buckets): if dense_bucket_tuple: bucket_type, dense_bucket = dense_bucket_tuple # Separate between diff groups self.allreduce_no_retain(dense_bucket, - dp_group=groups._get_expert_data_parallel_group(ep_name), - numel_per_bucket=elements_per_buffer) + dp_group=ep_dp_group, + numel_per_bucket=elements_per_buffer, + dp_world_size=dp_world_size) def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): if grads is None: - non_expert_grads, expert_grads = self._get_gradients_for_reduction() + if hasattr(self.optimizer, "get_grads_for_reduction"): + # This is currently for BF16 optimizer + non_expert_grads, expert_grads = self.optimizer.get_grads_for_reduction() + else: + non_expert_grads, expert_grads = self._get_gradients_for_reduction() else: assert not self.has_moe_layers, "attempting to reduce grads in unsupported way w.r.t. 
MoE" non_expert_grads = grads @@ -2456,8 +2467,8 @@ def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000) if self.has_moe_layers: self._reduce_expert_gradients(expert_grads, elements_per_buffer) - def sparse_allreduce_no_retain(self, bucket, dp_group): - allreduced_sparses = self.sparse_allreduce_bucket(bucket, dp_group) + def sparse_allreduce_no_retain(self, bucket, dp_group, dp_world_size=None): + allreduced_sparses = self.sparse_allreduce_bucket(bucket, dp_group, dp_world_size) # Densify sparse tensor and copy back to original location for tensor in allreduced_sparses: if tensor.is_sparse: @@ -2465,13 +2476,13 @@ def sparse_allreduce_no_retain(self, bucket, dp_group): else: tensor.orig_dense_tensor.copy_(tensor.to_dense()) - def sparse_allreduce_bucket(self, bucket, dp_group): + def sparse_allreduce_bucket(self, bucket, dp_group, dp_world_size=None): sparse_list = [] for sparse in bucket: - sparse_list.append(self.sparse_allreduce(sparse, dp_group)) + sparse_list.append(self.sparse_allreduce(sparse, dp_group, dp_world_size)) return sparse_list - def sparse_allreduce(self, sparse, dp_group): + def sparse_allreduce(self, sparse, dp_group, dp_world_size=None): original_data_type = sparse.values.dtype if self.communication_data_type != sparse.values.dtype: if self.communication_data_type in (torch.float16, torch.bfloat16): @@ -2483,12 +2494,13 @@ def sparse_allreduce(self, sparse, dp_group): indices = sparse.indices values = sparse.values + if dp_world_size is None: + dp_world_size = dist.get_world_size(group=dp_group) if self.postscale_gradients(): if self.gradient_average: - values.mul_(self.gradient_predivide_factor() / - (dist.get_world_size(group=dp_group) / float(self.sequence_parallel_size))) + values.mul_(self.gradient_predivide_factor() / (dp_world_size / float(self.sequence_parallel_size))) else: - values.mul_(1. / (dist.get_world_size(group=dp_group) / float(self.sequence_parallel_size))) + values.mul_(1. 
/ (dp_world_size / float(self.sequence_parallel_size))) indices_device_list = self.sparse_all_gather(indices, dp_group) values_device_list = self.sparse_all_gather(values, dp_group) @@ -2759,7 +2771,7 @@ def load_checkpoint(self, load_zero_checkpoint = load_path is not None and (self.zero_optimization() or self.bfloat16_enabled()) if load_zero_checkpoint: - if load_optimizer_states and not load_module_only: + if (load_optimizer_states and not load_module_only) or self.load_universal_checkpoint(): success = self._load_zero_checkpoint(load_dir, tag, load_optimizer_states=load_optimizer_states) else: success = False @@ -2784,8 +2796,6 @@ def load_checkpoint(self, if self.load_universal_checkpoint(): self.optimizer.update_lp_params() - if load_zero_checkpoint: - self.update_optimizer_step(step=client_states['iteration'] + 1) return load_path, client_states @@ -2963,24 +2973,6 @@ def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True): logger.info(f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}") return True - def update_optimizer_step(self, step): - - def set_step(d): - if isinstance(d['step'], torch.Tensor): - d['step'] = torch.tensor(step, dtype=d['step'].dtype, device=d['step'].device) - else: - d['step'] = step - - optimizer = self.optimizer - base_optimizer = optimizer.optimizer - state = base_optimizer.state - for group in optimizer.param_groups: - if 'step' in group: - set_step(group) - for p in group['params']: - if p in state and len(state[p]) > 0 and 'step' in state[p]: - set_step(state[p]) - def _get_mp_rank_zero_checkpoint_names(self, load_dir, tag, mp_rank, dp_world_size, bf16_mode): zero_ckpt_names = [] for dp_rank in range(dp_world_size): @@ -3236,9 +3228,12 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa # Load flow uses below saved file for model parameters, RNG and more if groups._get_data_parallel_rank() == 0: - # get non-moe parameters + # Get non-moe parameters + # Classes DeepSpeedEngine and PipelineEngine have different behavior for method module_state_dict. + # DeepSpeedEngine returns the state dict, where PipelineEngine saves the state dict and returns None. + # We need to get the state dict, therefore, call to DeepSpeedEngine (base class for PipelineEngine) model_state_dict = self._get_non_moe_state_dict( - self.module_state_dict(exclude_frozen_parameters=exclude_frozen_parameters)) + DeepSpeedEngine.module_state_dict(self, exclude_frozen_parameters=exclude_frozen_parameters)) # TODO: update num experts info,.. 
in checkpoint state = { diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index 182f806c839c..af8050c4646a 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -10,13 +10,14 @@ import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from deepspeed.runtime import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, required_torch_version +from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer +from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE -from deepspeed.utils import groups, logger, log_dist -from deepspeed import comm as dist +from deepspeed.utils import logger, log_dist +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD from deepspeed.accelerator import get_accelerator +from deepspeed.moe.utils import is_moe_param_group OVERFLOW_CHECK_TIMER = 'overflow_check' COMPUTE_NORM_TIMER = 'compute_norm' @@ -237,6 +238,10 @@ def step(self, closure=None): return self.overflow grads_groups_flat = [] + non_experts_grads_for_norm = [] + expert_grads_for_norm = {} + assert len(self.fp16_groups) == len(self.optimizer.param_groups) + for i, group in enumerate(self.fp16_groups): data_type = self.fp32_groups_flat[i].dtype @@ -250,15 +255,25 @@ def step(self, closure=None): p.grad = None self.fp32_groups_flat[i].grad = grads_groups_flat[i] + param_group = self.optimizer.param_groups[i] + if self.has_moe_layers and is_moe_param_group(param_group): + if param_group['name'] not in expert_grads_for_norm: + expert_grads_for_norm[param_group['name']] = [] + expert_grads_for_norm[param_group['name']].append(self.fp32_groups_flat[i]) + else: + non_experts_grads_for_norm.append(self.fp32_groups_flat[i]) self.timers(COMPUTE_NORM_TIMER).start() - all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu) + all_groups_norm = get_grad_norm(non_experts_grads_for_norm, mpu=self.mpu) self.timers(COMPUTE_NORM_TIMER).stop() if self.has_moe_layers: - all_groups_norm = self._get_norm_with_moe_layers(all_groups_norm) + all_groups_norm = get_norm_with_moe_layers(all_groups_norm, + mpu=self.mpu, + expert_tensors=expert_grads_for_norm, + norm_type=self.norm_type) scaled_global_grad_norm = get_global_norm(norm_list=[all_groups_norm]) @@ -290,20 +305,6 @@ def step(self, closure=None): return self.overflow - def _get_norm_with_moe_layers(self, all_groups_norm): - #all_groups_norm_old = all_groups_norm - # Need to allreduce (avg) the norms across different ranks because moe params will not be synced during allreduce - if self.using_pipeline: - pg = self.deepspeed.mpu.get_data_parallel_group() - else: - pg = groups._get_data_parallel_group() - scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=pg)) - scaled_norm_tensor = torch.tensor(scaled_norm, device=self.fp32_groups_flat[0].device, dtype=torch.float) - dist.all_reduce(scaled_norm_tensor, group=pg) - all_groups_norm = scaled_norm_tensor.item() - #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}") - return all_groups_norm - def unscale_and_clip_grads(self, grad_groups_flat, total_norm, apply_scale=True): # compute combined scale factor for this 
group combined_scale = self.cur_scale diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index ae3e5f573850..f8a50393ac5d 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index 9cd2e0f25648..0f70782fd3ff 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed import comm as dist -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.accelerator import get_accelerator diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index 9ef671e7e3b7..bd75ccd4f7a0 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index 14271255df2e..530355f846e2 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -11,10 +11,11 @@ import torch from torch._utils import _flatten_dense_tensors -from deepspeed.runtime import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm, required_torch_version +from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer +from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT from deepspeed.accelerator import get_accelerator from deepspeed import comm as dist diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index ef1c98a95c7b..1dda7f1aad32 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -4,6 +4,7 @@ # DeepSpeed Team from types import MethodType +from collections import OrderedDict import torch from deepspeed import comm as dist @@ -194,9 +195,15 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): #stores the loss for the entire batch self.total_loss = None + self.total_additional_losses = None self.agg_loss = torch.tensor(0.0, requires_grad=False).to(self.device) self.dp_group_loss = torch.tensor(0.0, requires_grad=False).to(self.device) + # stores aggregated-DP train final loss and aggregated-DP additional losses, if any + # additional losses are stored as dict: {loss-name: agg-loss} + self.agg_train_loss = None + self.agg_additional_losses = None + if self._config.pipeline['activation_checkpoint_interval'] > 0: self.module.activation_checkpoint_interval = 
self._config.pipeline['activation_checkpoint_interval'] # set use_reentrant default to True. @@ -284,10 +291,7 @@ def _exec_reduce_grads(self): self._force_grad_boundary = False def _bf16_reduce_grads(self): - # Make our own list of gradients from the optimizer's FP32 grads - grads = [] - self.buffered_allreduce_fallback(grads=self.optimizer.get_grads_for_reduction(), - elements_per_buffer=MEMORY_OPT_ALLREDUCE_SIZE) + self.buffered_allreduce_fallback(grads=None, elements_per_buffer=MEMORY_OPT_ALLREDUCE_SIZE) def _reserve_pipe_buffers(self, num_buffers): """Ensure that each pipeline buffer has at least ``num_buffers`` slots. @@ -363,6 +367,7 @@ def train_batch(self, data_iter=None): self.module.train() self.total_loss = None + self.total_additional_losses = None self._compute_loss = True # Do the work @@ -371,7 +376,9 @@ def train_batch(self, data_iter=None): stages=self.num_stages, stage_id=self.stage_id) self._exec_schedule(sched) - self.agg_train_loss = self._aggregate_total_loss() + + with torch.no_grad(): + self.agg_train_loss = self._aggregate_total_loss() self.timers(TRAIN_BATCH_TIMER).stop() @@ -380,10 +387,12 @@ def train_batch(self, data_iter=None): elapsed = self.timers(TRAIN_BATCH_TIMER).elapsed(reset=True) / 1000.0 iter_time = elapsed / self.steps_per_print() tput = self.train_batch_size() / iter_time - print(f'steps: {self.global_steps} ' - f'loss: {self.agg_train_loss:0.4f} ' - f'iter time (s): {iter_time:0.3f} ' - f'samples/sec: {tput:0.3f}') + log_str = f'steps: {self.global_steps} loss: {self.agg_train_loss:0.4f} ' + if self.agg_additional_losses is not None: + for loss_name, loss_value in self.agg_additional_losses.items(): + log_str += f'{loss_name}: {loss_value.item():0.4f} ' + log_str += f'iter time (s): {iter_time:0.3f} samples/sec: {tput:0.3f}' + print(log_str) else: self.timers(TRAIN_BATCH_TIMER).elapsed(reset=True) @@ -565,29 +574,66 @@ def _bcast_pipe_scalar(self, data, src_rank=None, dtype=torch.float32): def _aggregate_total_loss(self): # Scale loss, average among DP ranks, and bcast loss to the rest of my DP group if self.is_last_stage(): + # Scale loss and additional losses, if any loss = self._scale_loss_by_gas(self.total_loss) - self.dp_group_loss = loss.clone().detach() + self.agg_additional_losses = self.total_additional_losses + if self.agg_additional_losses is not None: + self.agg_additional_losses = OrderedDict({ + loss_name: self._scale_loss_by_gas(_loss.clone().detach()) + for loss_name, _loss in self.agg_additional_losses.items() + }) - ## Average loss across all data-parallel groups + self.dp_group_loss = loss.clone().detach() agg_loss = self.dp_group_loss.clone().detach() #print(f'RANK={self.global_rank} bcast SENDER src={self.global_rank} group={self.grid.pp_group}', flush=True) + + # Average loss across all data-parallel groups if self.is_data_parallel: - dist.all_reduce(agg_loss, group=self.mpu.get_data_parallel_group()) - agg_loss /= self.dp_world_size + if self.agg_additional_losses is None: + dist.all_reduce(agg_loss, group=self.mpu.get_data_parallel_group()) + agg_loss /= self.dp_world_size + else: + # use a single reduce op for agg_loss and additional losses, if any + assert '__train_loss__' not in self.agg_additional_losses.keys() + tensors = OrderedDict({'__train_loss__': agg_loss}) + tensors.update(self.agg_additional_losses.items()) + flat_tensor = torch.cat([t.clone().reshape(-1).detach() for t in tensors.values()]) + dist.all_reduce(flat_tensor, group=self.mpu.get_data_parallel_group()) + flat_tensor /= self.dp_world_size + offset = 0 
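# Aside: the flat buffer reduced above packs the main training loss first ('__train_loss__'),
# followed by each additional loss in its OrderedDict order; the loop below slices the reduced
# buffer back apart by numel, so only a single all_reduce is issued per aggregation.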
+ reduced_tensor = {} + for name, t in tensors.items(): + n_elem = t.numel() + reduced_tensor[name] = flat_tensor[offset:offset + n_elem].clone().detach().reshape(t.shape) + offset += n_elem + agg_loss = reduced_tensor['__train_loss__'] + self.agg_additional_losses = OrderedDict( + {name: reduced_tensor[name] + for name in self.agg_additional_losses.keys()}) assert self.global_rank in self.grid.pp_group - losses = torch.stack([self.dp_group_loss, agg_loss]).float() + losses = [self.dp_group_loss, agg_loss] + if self.agg_additional_losses is not None: + losses += list(self.agg_additional_losses.values()) + losses = torch.stack(losses).float() if self.is_pipe_parallel: dist.broadcast(tensor=losses, src=self.global_rank, group=self.mpu.get_pipe_parallel_group()) else: # Get loss from last stage src_rank = self.grid.stage_to_global(self.num_stages - 1) assert src_rank in self.grid.pp_group - losses = torch.Tensor([0., 0.]).to(self.device) + # losses to reduce are: dp_group_loss, agg_loss, model additional losses + # therefore: 2 + n_additional_losses + additional_losses = self.module.get_additional_losses() + n_additional_losses = 0 if additional_losses is None else len(additional_losses) + losses = torch.Tensor([0.] * (2 + n_additional_losses)).to(self.device) dist.broadcast(tensor=losses, src=src_rank, group=self.grid.get_pipe_parallel_group()) self.dp_group_loss = losses[0].clone().detach() agg_loss = losses[1].clone().detach() - + if additional_losses is not None: + self.agg_additional_losses = OrderedDict( + {name: losses[2 + i].clone().detach() + for i, name in enumerate(additional_losses.keys())}) return agg_loss def set_dataloader(self, loader): @@ -715,19 +761,34 @@ def _exec_forward_pass(self, buffer_id): self.loss = outputs if self.eval_return_logits: self.outputs = outputs + if isinstance(self.loss, torch.Tensor): self.fwd_outputs.append(self.loss.detach()) - - if self.total_loss is None: - self.total_loss = torch.zeros_like(self.loss) - self.total_loss += self.loss.detach() else: self.fwd_outputs.append([l.detach() for l in self.loss]) - if self.total_loss is None: - self.total_loss = [torch.zeros_like(l) for l in self.loss] - for idx, l in enumerate(self.loss): - self.total_loss[idx] += l.detach() + def add_to_total_loss(_total_loss, _loss): + if isinstance(_loss, torch.Tensor): + if _total_loss is None: + _total_loss = torch.zeros_like(_loss) + _total_loss += _loss.detach() + else: + if _total_loss is None: + _total_loss = [torch.zeros_like(_l) for _l in _loss] + for _idx, _l in enumerate(_loss): + _total_loss[_idx] += _l.detach() + return _total_loss + + self.total_loss = add_to_total_loss(self.total_loss, self.loss) + + # aggregate additional losses across gradient accumulation steps + additional_losses = self.module.get_additional_losses() + if additional_losses is not None: + if self.total_additional_losses is None: + self.total_additional_losses = OrderedDict() + for name, loss in additional_losses.items(): + total = self.total_additional_losses[name] if name in self.total_additional_losses else None + self.total_additional_losses[name] = add_to_total_loss(total, loss) def _exec_backward_pass(self, buffer_id): assert self.optimizer is not None, "must provide optimizer during " \ @@ -1332,7 +1393,7 @@ def load_module_state_dict(self, checkpoint, strict=True, custom_load_fn=None, f strict (bool, optional): Strict state loading. Defaults to True. """ assert custom_load_fn is None, "custom_load_fn not supported w. 
pipeline parallelism" - state_dict = checkpoint['module'] + state_dict = checkpoint if self.has_moe_layers else checkpoint['module'] if (state_dict is not None) and (not isinstance(state_dict, str)): super().load_module_state_dict(state_dict, strict) return @@ -1371,3 +1432,6 @@ def _exec_schedule(self, pipe_schedule): # Equivalent to: self._exec_forward_pass(buffer_id=0) self._exec_instr = MethodType(self._INSTRUCTION_MAP[type(cmd)], self) self._exec_instr(**cmd.kwargs) + + def get_additional_losses(self): + return self.agg_additional_losses diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index c11379b0a0d7..8036faef72ee 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -634,3 +634,10 @@ def _is_checkpointable(self, funcs): return all(f.__class__.__name__ in self.checkpointable_layers for f in funcs) params = [f.parameters() for f in funcs if isinstance(f, torch.nn.Module)] return any(len(list(p)) > 0 for p in params) + + def get_additional_losses(self): + """ Returns model specific additional losses for reporting + + Return a dictionary of {"loss name": loss_value} or None if no additional losses. + """ + return None diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 31b9a14bf0f4..2b12a9573c4b 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -9,9 +9,7 @@ import torch from deepspeed import comm as dist -# To query whether we have send/recv support -from packaging.version import Version -from deepspeed.git_version_info import torch_info +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator _groups = None @@ -21,9 +19,7 @@ def can_send_recv() -> bool: - torch_version = Version(torch_info['version']) - sendrecv_min = Version('1.8') - return torch_version >= sendrecv_min + return required_torch_version(min_version=1.8) #initializes adjacent process groups diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index d1ebe4b2f83d..c55f8a0e2995 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -14,7 +14,6 @@ import psutil import gc from math import sqrt -from packaging import version as pkg_version import torch from deepspeed import comm as dist @@ -25,6 +24,8 @@ from torch import inf from deepspeed.utils import groups, logger +from deepspeed.utils.bwc import (bwc_tensor_model_parallel_rank, bwc_pipeline_parallel_world_size, + bwc_pipeline_parallel_group) from deepspeed.runtime.constants import PIPE_REPLICATED from numpy import prod from deepspeed.accelerator import get_accelerator @@ -117,44 +118,6 @@ def is_model_parallel_parameter(p) -> bool: return False -def bwc_tensor_model_parallel_rank(mpu=None): - """Backwards-compatible way of querying the tensor model parallel rank from - an ``mpu`` object. - - *Tensor* model parallelism means that tensors are physically split across - processes. This contrasts with *pipeline* model parallelism, in which the - layers are partitioned but tensors left intact. - - The API for tensor model parallelism has changed across versions and this - helper provides a best-effort implementation across versions of ``mpu`` - objects. The preferred mechanism is - ``mpu.get_tensor_model_parallel_rank()``. - - This should "just work" with both Megatron-LM and DeepSpeed's pipeline - parallelism. - - Args: - mpu (model parallel unit, optional): The tensor model parallel rank. - If ``mpu=None``, returns 0. Defaults to ``None``. 
- - Returns: - int: the rank - """ - if mpu is None: - # No model parallelism in easy :) - return 0 - - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - # New Megatron and DeepSpeed convention (post pipeline-parallelism release) - return mpu.get_tensor_model_parallel_rank() - elif hasattr(mpu, 'get_slice_parallel_rank'): - # Some DeepSpeed + pipeline parallelism versions - return mpu.get_slice_parallel_rank() - else: - # Deprecated Megatron and DeepSpeed convention - return mpu.get_model_parallel_rank() - - def copy_to_device(item, device, criterion_func): """ Return a copy of tensor on specified device. @@ -205,6 +168,17 @@ def move_to_device(item, device, criterion_func): return item +def get_norm_with_moe_layers_fast(all_groups_norm, group): + # This implementation standardizes the grad_norm across ranks. A more precise implementation can be found in 'get_norm_with_moe_layers'. + # Need to allreduce (avg) the norms across different ranks because moe params will not be synced during allreduce + scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=group)) + scaled_norm_tensor = torch.tensor(scaled_norm, device=get_accelerator().current_device(), dtype=torch.float) + dist.all_reduce(scaled_norm_tensor, group=group) + all_groups_norm = scaled_norm_tensor.item() + #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}") + return all_groups_norm + + class CheckOverflow(object): '''Checks for overflow in gradient across parallel process''' @@ -369,8 +343,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): for p in parameters: all_norms.append(p.grad.data.abs().max().float()) total_norm = torch.stack(all_norms).max() - origin_device = total_norm.device.type - total_norm = total_norm.to(get_accelerator().device_name()) + total_norm = total_norm.to(get_accelerator().current_device_name()) # Take max across all GPUs. if mpu is not None: dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) @@ -387,9 +360,8 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): if len(all_norms) > 0: total_norm = torch.stack(all_norms).square().sum().float() else: - total_norm = torch.FloatTensor([0.0]).to(parameters[0].device) - origin_device = total_norm.device.type - total_norm = total_norm.to(get_accelerator().device_name()) + total_norm = get_accelerator().FloatTensor([0.0]) + total_norm = total_norm.to(get_accelerator().current_device_name()) # Sum across all model parallel GPUs. 
if mpu is not None: dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) @@ -402,11 +374,11 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): dist.all_reduce(scaled_norm_tensor, group=pg) total_norm = scaled_norm_tensor - total_norm = total_norm.to(origin_device) + total_norm = total_norm.to(parameters[0].device) - max_norm = torch.tensor([float(max_norm)], device=parameters[0].device) + max_norm = torch.tensor([float(max_norm)], device=total_norm.device) clip_coef = max_norm / (total_norm + 1e-6) - tmp_tensor = torch.tensor([1.0], device=parameters[0].device) + tmp_tensor = torch.tensor([1.0], device=clip_coef.device) clip_coef = torch.min(tmp_tensor, clip_coef) for p in parameters: p.grad.data.mul_(clip_coef) @@ -861,7 +833,7 @@ def clip_gradients(parameters, max_norm=1.0, global_grad_norm=None, mpu=None, ep return global_grad_norm -def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=False): +def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=False, moe_ep_group=None): """Get norm of an iterable of tensors. This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and @@ -879,37 +851,64 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=F assert all([torch.is_tensor(t) for t in input_tensors]), f'expected list of only tensors' norm_type = float(norm_type) + all_norms = [] if norm_type == inf: - total_norm = max(t.data.abs().max() for t in input_tensors) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + for t in input_tensors: + all_norms.append(t.data.abs().max().float()) + total_norm = torch.stack(all_norms).max() + device_total_norm = total_norm.to(get_accelerator().current_device_name()) + # Max across model parallel if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() + # For MoE grads, max over model parallel only if MoE-TP is enabled + if moe_ep_group is None or groups._get_expert_model_parallel_world_size() > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) + # If MoE grads and MoE-TP disabled, max over pipeline parallel + elif bwc_pipeline_parallel_world_size(mpu) > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.MAX, group=bwc_pipeline_parallel_group(mpu)) + + # MoE grads: max across expert parallel group + if moe_ep_group is not None: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.MAX, group=moe_ep_group) + total_norm = device_total_norm.to(input_tensors[0].device) else: - if use_graph: - if 'norm_tensors_compute_buffer' not in graph_cache: - graph_cache['norm_tensors_compute_buffer'] = [t.data.float().norm(norm_type) for t in input_tensors] - compute_buffer = graph_cache['norm_tensors_compute_buffer'] - def _norm_tensors(tensor_list, _compute_buffer, _norm_type): - for i, t in enumerate(tensor_list): - _compute_buffer[i].data.copy_(t.data.float().norm(_norm_type)**_norm_type) - if i != 0: - _compute_buffer[0].data.add_(_compute_buffer[i].data) + if 'norm_tensors_compute_buffer' not in graph_cache or len( + graph_cache['norm_tensors_compute_buffer']) != len(input_tensors): + graph_cache['norm_tensors_compute_buffer'] = [ + torch.empty([], dtype=torch.float, device=get_accelerator().current_device_name()) + for t in input_tensors + ] + compute_buffer = graph_cache['norm_tensors_compute_buffer'] - graph_process(False, _norm_tensors, input_tensors, 
compute_buffer, norm_type) + def _norm_tensors(tensor_list, _compute_buffer, _norm_type): + for i, t in enumerate(tensor_list): + _compute_buffer[i].data.copy_(t.data.float().norm(_norm_type)**_norm_type) + if i != 0: + _compute_buffer[0].data.add_(_compute_buffer[i].data) - total_norm = compute_buffer[0] + if use_graph: + graph_process(False, _norm_tensors, input_tensors, compute_buffer, norm_type) else: - total_norm = sum([t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) + _norm_tensors(input_tensors, compute_buffer, norm_type) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]).detach() + device_total_norm = compute_buffer[0].float().detach() + + # Sum across model parallel if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + # For MoE grads, sum over model parallel only if MoE-TP is enabled + if moe_ep_group is None or groups._get_expert_model_parallel_world_size() > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) + # If MoE grads and MoE-TP disabled, sum over pipeline parallel + elif bwc_pipeline_parallel_world_size(mpu) > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.SUM, group=bwc_pipeline_parallel_group(mpu)) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + # MoE grads: sum across expert parallel group + if moe_ep_group is not None: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.SUM, group=moe_ep_group) + total_norm = device_total_norm.to(input_tensors[0].device).pow(1. / norm_type) + + inf_or_nan = total_norm.isinf().logical_or(total_norm.isnan()) + total_norm.masked_fill_(inf_or_nan, -1) return total_norm @@ -1036,15 +1035,43 @@ def get_inactive_params(param_list): param.ds_status == ZeroParamStatus.NOT_AVAILABLE)] -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" +def get_norm_with_moe_layers(non_expert_norm, mpu, expert_tensors, norm_type=2): + """ Compute the global norm with MoE experts - torch_version = pkg_version.parse(torch.__version__) + Inputs: + non_expert_norm (float) : the calculated norm of the non-expert params + expert_tensors (Dict[ep_name, List[Tensor]): Dictionary of expert group name to list of grad tensors + norm_type (int): the norm to use - if min_version and pkg_version.parse(str(min_version)) > torch_version: - return False + Returns: + if norm is (-/+) inf, returns -1 + otherwise the global norm (float) + """ - if max_version and pkg_version.parse(str(max_version)) < torch_version: - return False + def to_tensor(v): + return get_accelerator().FloatTensor(float(v)).detach() + + group_norms = [non_expert_norm] + for exp_name, tensors in expert_tensors.items(): + group_norm = get_global_norm_of_tensors(input_tensors=tensors, + mpu=mpu, + norm_type=norm_type, + use_graph=False, + moe_ep_group=groups._get_expert_parallel_group(exp_name)) + group_norms.append(group_norm) + + # check if all norms are valid + group_norms = torch.stack([to_tensor(norm) for norm in group_norms]) + if group_norms.eq(-1).any(): + return -1 - return True + # combine norms + if norm_type == inf: + total_norm = group_norms.max().item() + else: + total_norm = group_norms.pow(norm_type).sum() + total_norm = total_norm.item()**(1. 
/ norm_type) + if total_norm == float('inf') or total_norm == -float('inf'): + total_norm = -1 + + return total_norm diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 142259c1b7df..c8099791f882 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -56,7 +56,8 @@ def __init__(self, param: Parameter) -> None: self.__param = param def wait(self) -> None: - get_accelerator().current_stream().synchronize() + if not get_accelerator().is_synchronized_device(): + get_accelerator().current_stream().synchronize() self.__param.ds_status = ZeroParamStatus.AVAILABLE @@ -81,7 +82,8 @@ def wait(self) -> None: if self.__complete: return - get_accelerator().current_stream().synchronize() + if not get_accelerator().is_synchronized_device(): + get_accelerator().current_stream().synchronize() for param in self.__params: assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" param.ds_status = ZeroParamStatus.AVAILABLE @@ -363,7 +365,8 @@ def _set_dtype(self, ds_config, dtype): else: self.dtype = torch.float else: - self.dtype = dtype or torch.half + self.dtype = dtype or torch.float16 if get_accelerator().is_fp16_supported( + ) else torch.bfloat16 if get_accelerator().is_bf16_supported() else torch.float32 def patch_init_and_builtins(self): diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index eb611241fb05..706d4fb12fb0 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -11,7 +11,7 @@ from deepspeed.utils import groups from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from deepspeed.runtime import ZeROOptimizer +from deepspeed.runtime.base_optimizer import ZeROOptimizer from deepspeed.utils import logger from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index b1d94a4459d9..16b9c3c18919 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -4,21 +4,20 @@ # DeepSpeed Team import torch -import os from deepspeed import comm as dist from packaging import version as pkg_version from collections import OrderedDict from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from deepspeed.runtime import ZeROOptimizer +from deepspeed.runtime.base_optimizer import ZeROOptimizer from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler -from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, empty_cache, see_memory_usage, inf, - is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups) - +from deepspeed.runtime.utils import (empty_cache, see_memory_usage, inf, is_model_parallel_parameter, + align_dense_tensors, all_gather_dp_groups) from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.utils import logger +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank from deepspeed.moe.utils import is_moe_param from deepspeed.git_version_info import version @@ -1360,7 +1359,7 @@ def reduce_ipg_grads(self): self.average_tensor(extra_large_grad_reduc.view(-1)) self.extra_large_param_to_reduce = None else: -
self.average_tensor(self.ipg_buffer[self.ipg_index]) + self.average_tensor(self.ipg_buffer[self.ipg_index].narrow(0, 0, self.elements_in_ipg_bucket)) else: self.buffered_reduce_fallback(None, self.grads_in_ipg_bucket, @@ -2287,36 +2286,13 @@ def load_state_dict(self, self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights) def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): - self._load_hp_checkpoint_state(checkpoint_folder) + self.load_hp_checkpoint_state_from_checkpoint_dir("bit16_groups", checkpoint_folder) @property def param_groups(self): """Forward the wrapped optimizer's parameters.""" return self.optimizer.param_groups - def _load_hp_checkpoint_state(self, checkpoint_dir): - checkpoint_dir = os.path.join(checkpoint_dir, "zero") - optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt") - assert os.path.isfile( - optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.' - optim_sd = torch.load(optim_state_path) - self._load_global_state(optim_sd) - - tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) - if self.mpu is None: - logger.warn("MPU is not provided, setting tp size to 1 in checkpoint loading.") - tp_world_size = 1 - else: - tp_world_size = self.mpu.get_slice_parallel_world_size() if hasattr(self.mpu, "get_slice_parallel_world_size") \ - else self.mpu.get_tensor_model_parallel_world_size() - - for i, _ in enumerate(self.optimizer.param_groups): - for lp in self.bit16_groups[i]: - if lp._hp_mapping is not None: - #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, - tp_world_size) - def _load_global_state(self, sd): self.loss_scaler = sd.get(LOSS_SCALER, self.loss_scaler) self.dynamic_loss_scale = sd.get('dynamic_loss_scale', self.dynamic_loss_scale) diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index 33ea8ba60818..75fb6aa9d30a 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -10,7 +10,7 @@ from .groups import * from .nvtx import instrument_w_nvtx # TODO: Move tensor fragment and mixed precision to zero utils -from .tensor_fragment import tensor_fragment, get_full_hp_param, get_hp_fragment_mapping, fragment_address, get_full_hp_grad +from .tensor_fragment import tensor_fragment, get_full_hp_param, get_hp_fragment_mapping, fragment_address, get_full_hp_grad, map_to_flat_opt_states from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state from .tensor_fragment import set_full_hp_param from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state diff --git a/deepspeed/utils/bwc.py b/deepspeed/utils/bwc.py new file mode 100644 index 000000000000..69fcc251a684 --- /dev/null +++ b/deepspeed/utils/bwc.py @@ -0,0 +1,104 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + + +def bwc_tensor_model_parallel_rank(mpu=None): + """Backwards-compatible way of querying the tensor model parallel rank from + an ``mpu`` object. + + *Tensor* model parallelism means that tensors are physically split across + processes. This contrasts with *pipeline* model parallelism, in which the + layers are partitioned but tensors left intact. 
+ + The API for tensor model parallelism has changed across versions and this + helper provides a best-effort implementation across versions of ``mpu`` + objects. The preferred mechanism is + ``mpu.get_tensor_model_parallel_rank()``. + + This should "just work" with both Megatron-LM and DeepSpeed's pipeline + parallelism. + + Args: + mpu (model parallel unit, optional): The tensor model parallel rank. + If ``mpu=None``, returns 0. Defaults to ``None``. + + Returns: + int: the rank + """ + if mpu is None: + # No model parallelism in easy :) + return 0 + + if hasattr(mpu, 'get_tensor_model_parallel_rank'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + return mpu.get_tensor_model_parallel_rank() + elif hasattr(mpu, 'get_slice_parallel_rank'): + # Some DeepSpeed + pipeline parallelism versions + return mpu.get_slice_parallel_rank() + else: + # Deprecated Megatron and DeepSpeed convention + return mpu.get_model_parallel_rank() + + +def bwc_tensor_model_parallel_world_size(mpu=None): + """Backwards-compatible way of querying the tensor model parallel world size. + Similar to bwc_tensor_model_parallel_rank. + """ + if mpu is None: + return 1 + + if hasattr(mpu, 'get_tensor_model_parallel_world_size'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + return mpu.get_tensor_model_parallel_world_size() + elif hasattr(mpu, 'get_slice_parallel_world_size'): + # Some DeepSpeed + pipeline parallelism versions + return mpu.get_slice_parallel_world_size() + else: + # Deprecated Megatron and DeepSpeed convention + return mpu.get_model_parallel_world_size() + + +def bwc_tensor_model_parallel_group(mpu=None): + """Backwards-compatible way of querying the tensor model parallel group. + Similar to bwc_tensor_model_parallel_rank. 
+ """ + if mpu is None: + return None + + if hasattr(mpu, 'get_tensor_model_parallel_group'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + return mpu.get_tensor_model_parallel_group() + elif hasattr(mpu, 'get_slice_parallel_group'): + # Some DeepSpeed + pipeline parallelism versions + return mpu.get_slice_parallel_group() + else: + # Deprecated Megatron and DeepSpeed convention + return mpu.get_model_parallel_group() + + +def bwc_pipeline_parallel_world_size(mpu=None): + """Backwards-compatible way of querying the pipeline parallel world size.""" + world_size = 1 + if mpu is not None: + if hasattr(mpu, 'get_pipeline_model_parallel_world_size'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + world_size = mpu.get_pipeline_model_parallel_world_size() + elif hasattr(mpu, 'get_pipe_parallel_world_size'): + # DeepSpeed Topology + world_size = mpu.get_pipe_parallel_world_size() + return world_size + + +def bwc_pipeline_parallel_group(mpu=None): + """Backwards-compatible way of querying the pipeline parallel group.""" + if mpu is None: + return None + if hasattr(mpu, 'get_pipeline_model_parallel_group'): + # Megatron + return mpu.get_pipeline_model_parallel_group() + elif hasattr(mpu, 'get_pipe_parallel_group'): + # DeepSpeed Topology + return mpu.get_pipe_parallel_group() + assert False, 'mpu does not support pipeline parallel group' diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index 63dda7f5aaae..c49f4520e16e 100644 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -27,6 +27,7 @@ from deepspeed import comm as dist from deepspeed.utils import log_dist +from deepspeed.utils.bwc import bwc_tensor_model_parallel_world_size, bwc_pipeline_parallel_world_size from deepspeed.utils.exceptions import DeprecatedException from deepspeed.accelerator import get_accelerator # Expert parallel group that the current rank belongs to. @@ -128,31 +129,32 @@ def _create_expert_and_data_parallel(expert_parallel_size_, use_data_before_expe log_dist(f'Creating expert and data parallel groups with size {expert_parallel_size_}', ranks=[0]) world_size = dist.get_world_size() + pp_world_size = 1 if mpu is None else bwc_pipeline_parallel_world_size(mpu) rank = dist.get_rank() - _ensure_divisibility(world_size, expert_parallel_size_) + pp_stride = world_size // pp_world_size + _ensure_divisibility(pp_stride, expert_parallel_size_) group_name = f"ep_size_{expert_parallel_size_}" # Build the expert data parallel groups. 
global _EXPERT_DATA_PARALLEL_GROUP - ep_stride = world_size // expert_parallel_size_ + ep_stride = pp_stride // expert_parallel_size_ # Only create group if it does not already exist if group_name not in _EXPERT_DATA_PARALLEL_GROUP: - for i in range(expert_parallel_size_): - if use_data_before_expert_parallel_: - ranks = range(i * ep_stride, (i + 1) * ep_stride) - else: - ranks = range(i, world_size, expert_parallel_size_) - group = dist.new_group(ranks) - log_dist(f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', [0]) - if use_data_before_expert_parallel_: - if i == (rank // ep_stride): - _EXPERT_DATA_PARALLEL_GROUP[group_name] = group - else: - if i == (rank % expert_parallel_size_): + for pp_stage_start in range(0, world_size, pp_stride): + for i in range(expert_parallel_size_): + if use_data_before_expert_parallel_: + ranks = range(pp_stage_start + i * ep_stride, pp_stage_start + (i + 1) * ep_stride) + else: + ranks = range(pp_stage_start + i, pp_stage_start + pp_stride, expert_parallel_size_) + group = dist.new_group(ranks) + log_dist( + f'Creating expert data parallel process group named {group_name} ' + f'with ranks: {list(ranks)}', [0]) + if rank in ranks: _EXPERT_DATA_PARALLEL_GROUP[group_name] = group # Build the expert parallel groups. @@ -161,24 +163,29 @@ def _create_expert_and_data_parallel(expert_parallel_size_, use_data_before_expe # Only create group if it does not already exist if group_name not in _EXPERT_PARALLEL_GROUP: if use_data_before_expert_parallel_: - for i in range(ep_stride): - ranks = range(i, world_size, ep_stride) - group = dist.new_group(ranks) - log_dist(f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0]) - if i == (rank % ep_stride): - _EXPERT_PARALLEL_GROUP[group_name] = group + for pp_stage_start in range(0, world_size, pp_stride): + for i in range(ep_stride): + ranks = range(pp_stage_start + i, pp_stage_start + pp_stride, ep_stride) + group = dist.new_group(ranks) + log_dist( + f'creating expert parallel process group named {group_name} ' + f'with ranks: {list(ranks)}', [0]) + if rank in ranks: + _EXPERT_PARALLEL_GROUP[group_name] = group else: for i in range(world_size // expert_parallel_size_): ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_) group = dist.new_group(ranks) - log_dist(f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0]) - if i == (rank // expert_parallel_size_): + log_dist(f'creating expert parallel process group named {group_name} ' + f'with ranks: {list(ranks)}', [0]) + if rank in ranks: _EXPERT_PARALLEL_GROUP[group_name] = group def _get_expert_parallel_ranks(world_size, - model_parallel_size_, + tensor_parallel_size_, expert_parallel_size_, + pipeline_parallel_size_=1, use_data_before_expert_parallel_=False): """Generate expert parallel and expert data parallel group ranks list. @@ -193,32 +200,40 @@ def _get_expert_parallel_ranks(world_size, Args: world_size (int): Distributed world size. - model_parallel_size_ (int): Model parallel group size. + tensor_parallel_size_ (int): Tensor parallel group size. expert_parallel_size_ (int): Expert parallel group size. + pipeline_parallel_size_ (int): Pipeline parallel group size use_data_before_expert_parallel_ (bool): Use the D + E instead of E + D topology Returns: Expert parallel group ranks and Expert data parallel group ranks list. 
""" - _ensure_divisibility(world_size, model_parallel_size_) - dp_world_size = world_size // model_parallel_size_ + _ensure_divisibility(world_size, tensor_parallel_size_ * pipeline_parallel_size_) + dp_world_size = world_size // (tensor_parallel_size_ * pipeline_parallel_size_) _ensure_divisibility(dp_world_size, expert_parallel_size_) # Generate data parallel groups data_parallel_groups = [] - dp_group_size = model_parallel_size_ + dp_group_size = tensor_parallel_size_ + pp_stride = world_size // pipeline_parallel_size_ if use_data_before_expert_parallel_: - dp_stride = world_size // expert_parallel_size_ // model_parallel_size_ - for i in range(dp_group_size): - data_parallel_groups.append(list()) - for ds in range(dp_stride): - # [0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30] - # [1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31] - data_parallel_groups[-1].extend( - list(range(i + ds * model_parallel_size_, world_size, dp_stride * model_parallel_size_))) + dp_stride = world_size // expert_parallel_size_ // tensor_parallel_size_ // pipeline_parallel_size_ + for pp_stage_start in range(0, world_size, pp_stride): + pp_stage_next = pp_stage_start + pp_stride + for i in range(dp_group_size): + data_parallel_groups.append(list()) + for ds in range(dp_stride): + # [0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30] + # [1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31] + data_parallel_groups[-1].extend( + list( + range(pp_stage_start + i + ds * tensor_parallel_size_, pp_stage_next, + dp_stride * tensor_parallel_size_))) else: - for i in range(dp_group_size): - data_parallel_groups.append(list(range(i, world_size, dp_group_size))) + for pp_stage_start in range(0, world_size, pp_stride): + pp_stage_next = pp_stage_start + pp_stride + for i in range(dp_group_size): + data_parallel_groups.append(list(range(pp_stage_start + i, pp_stage_next, dp_group_size))) expert_parallel_groups = [] expert_data_parallel_groups = [] @@ -252,36 +267,33 @@ def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu, use_data_ expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] """ assert dist.is_initialized(), "dist is not initialized" - model_parallel_size_ = mpu.get_model_parallel_world_size() + tensor_parallel_size_ = bwc_tensor_model_parallel_world_size(mpu) global expert_tensor_parallel_world_size - expert_tensor_parallel_world_size = model_parallel_size_ + expert_tensor_parallel_world_size = tensor_parallel_size_ world_size = dist.get_world_size() rank = dist.get_rank() dp_world_size = mpu.get_data_parallel_world_size() - dp_rank = mpu.get_data_parallel_rank() + pp_world_size = 1 if mpu is None else bwc_pipeline_parallel_world_size(mpu) - _ensure_divisibility(world_size, model_parallel_size_) + _ensure_divisibility(world_size, tensor_parallel_size_) _ensure_divisibility(dp_world_size, expert_parallel_size_) log_dist( - f"Creating deepspeed groups with model parallel size {model_parallel_size_}, expert parallel size {expert_parallel_size_}, world size {world_size}, dp world size {dp_world_size}", - [0]) + f"Creating deepspeed groups with model parallel size {tensor_parallel_size_}, " + f"pipeline parallel size {pp_world_size}, expert parallel size {expert_parallel_size_}, " + f"world size {world_size}, dp world size {dp_world_size}", [0]) global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP - # Get world size and rank. Ensure some consistencies. 
- _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group() - _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group() - group_name = f"ep_size_{expert_parallel_size_}" # Only create groups if they don't already exist # Need to check conditions outside the group creation loop because of the way torch.dist group creation works if group_name not in _EXPERT_DATA_PARALLEL_GROUP and group_name not in _EXPERT_PARALLEL_GROUP: expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks( - world_size, model_parallel_size_, expert_parallel_size_, use_data_before_expert_parallel_) + world_size, tensor_parallel_size_, expert_parallel_size_, pp_world_size, use_data_before_expert_parallel_) for ranks in expert_parallel_groups: group = dist.new_group(ranks) if rank in list(ranks): diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py index 49eefafcfbcc..b34722580ddd 100644 --- a/deepspeed/utils/tensor_fragment.py +++ b/deepspeed/utils/tensor_fragment.py @@ -58,6 +58,21 @@ def get_hp_fragment(self, optim_state_key=None): return self.get_optim_state_fragment(optim_state_key) +def map_to_flat_opt_states(flat_hp_tensor, lp_tensors, optim_state, opt_keys): + for key in opt_keys: + hp_param = flat_hp_tensor + buffer = torch.zeros_like(hp_param) + + for lp in lp_tensors: + if lp._hp_mapping is not None: + hp_fragment_address = lp._hp_mapping.get_hp_fragment_address() + hp_fragment = buffer.narrow(0, hp_fragment_address.start, hp_fragment_address.numel) + hp_fragment.data.copy_(lp._hp_mapping.get_hp_fragment(optim_state_key=key).data) + lp._hp_mapping.hp_fragment = hp_fragment + + optim_state[hp_param][key] = buffer + + def get_full_hp_param(self, optim_state_key=None): reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() if self._hp_mapping is not None: diff --git a/deepspeed/utils/torch.py b/deepspeed/utils/torch.py new file mode 100644 index 000000000000..eb22d3561035 --- /dev/null +++ b/deepspeed/utils/torch.py @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from packaging import version as pkg_version + +import torch + + +def required_torch_version(min_version=None, max_version=None): + assert min_version or max_version, "Must provide a min_version or max_version argument" + + torch_version = pkg_version.parse(torch.__version__) + + if min_version and pkg_version.parse(str(min_version)) > torch_version: + return False + + if max_version and pkg_version.parse(str(max_version)) < torch_version: + return False + + return True diff --git a/docs/_tutorials/accelerator-abstraction-interface.md b/docs/_tutorials/accelerator-abstraction-interface.md index db1a6005f793..88a43236ce9d 100644 --- a/docs/_tutorials/accelerator-abstraction-interface.md +++ b/docs/_tutorials/accelerator-abstraction-interface.md @@ -79,13 +79,13 @@ torch.distributed.init_process_group(get_accelerator().communication_backend_nam ``` # Run DeepSpeed model on different accelerators -Once a model is ported with DeepSpeed Accelerator Abstraction Interface, we can run this model on different accelerators using extension to DeepSpeed. DeepSpeed check whether certain extension is installed in the environment to decide whether to use the Accelerator backend in that extension. 
For example if we wish to run model on Intel GPU, we can install _Intel Extension for DeepSpeed_ following the instruction in [link](https://github.com/intel/intel-extension-for-deepspeed/) +Once a model is ported with DeepSpeed Accelerator Abstraction Interface, we can run this model on different accelerators using an extension to DeepSpeed. DeepSpeed checks whether a certain extension is installed in the environment to decide whether to use the Accelerator backend in that extension. For example, if we wish to run a model on Intel GPU, we can install _Intel Extension for DeepSpeed_ following the instructions in this [link](https://github.com/intel/intel-extension-for-deepspeed/) -After the extension is installed, install DeepSpeed and run model. The model will be running on top of DeepSpeed. Because DeepSpeed installation is also accelerator related, it is recommended to install DeepSpeed accelerator extension before install DeepSpeed. +After the extension is installed, install DeepSpeed and run the model. The model will then run on top of DeepSpeed. Because DeepSpeed installation is also accelerator related, it is recommended to install the DeepSpeed accelerator extension before installing DeepSpeed. `CUDA_Accelerator` is the default accelerator in DeepSpeed. If no other DeepSpeed accelerator extension is installed, `CUDA_Accelerator` will be used. -When run a model on different accelerator in a cloud environment, the recommended practice is provision environment for each accelerator in different env with tool such as _anaconda/miniconda/virtualenv_. When run model on different Accelerator, load the env accordingly. +When running a model on different accelerators in a cloud environment, the recommended practice is to provision an environment for each accelerator in a different env with tools such as _anaconda/miniconda/virtualenv_. When running a model on a different accelerator, load the corresponding env. Note that different accelerator may have different 'flavor' of float16 or bfloat16. So it is recommended to make the model configurable for both float16 and bfloat16, in that way model code does not need to be changed when running on different accelerators.
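The float16/bfloat16 note above is the same pattern the unit-test changes later in this patch follow: probe the active accelerator and enable `fp16` or `bf16` in the DeepSpeed config accordingly. A minimal sketch of that idea, assuming only the accelerator API already used in this patch (`is_fp16_supported()` / `is_bf16_supported()`); the local `preferred_dtype` helper is illustrative, not the helper the unit tests import from `unit.common`:

```
import torch
from deepspeed.accelerator import get_accelerator


def preferred_dtype():
    # Pick the 'flavor' of reduced precision the active accelerator supports,
    # falling back to fp32 when neither fp16 nor bf16 is available.
    if get_accelerator().is_fp16_supported():
        return torch.float16
    elif get_accelerator().is_bf16_supported():
        return torch.bfloat16
    return torch.float32


ds_config = {"train_batch_size": 8, "optimizer": {"type": "Adam"}}
if preferred_dtype() == torch.float16:
    ds_config["fp16"] = {"enabled": True}
elif preferred_dtype() == torch.bfloat16:
    ds_config["bf16"] = {"enabled": True}
```

Keeping the dtype choice in one place like this lets the same model and config run unchanged on CUDA, XPU, HPU, or CPU builds of DeepSpeed.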
diff --git a/docs/index.md b/docs/index.md index e3351ee1a3d7..1ca92019bff2 100755 --- a/docs/index.md +++ b/docs/index.md @@ -94,7 +94,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | | Documentation | | ---------------------------------------------------------------------------------------------- | -------------------------------------------- | | | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) | -| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/main/en/deepspeed) | +| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) | | | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) | | | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 9c41f35eaf1b..ff11ca180072 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -30,3 +30,4 @@ __op_builders__.append(builder) ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} +accelerator_name = get_accelerator()._name diff --git a/op_builder/builder.py b/op_builder/builder.py index dd77f967cc60..8dc825c7926d 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -464,8 +464,9 @@ def load(self, verbose=True): if self.name in __class__._loaded_ops: return __class__._loaded_ops[self.name] - from deepspeed.git_version_info import installed_ops, torch_info - if installed_ops.get(self.name, False): + from deepspeed.git_version_info import installed_ops, torch_info, accelerator_name + from deepspeed.accelerator import get_accelerator + if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name: # Ensure the op we're about to load was compiled with the same # torch/cuda versions we are currently using at runtime. self.validate_torch_version(torch_info) diff --git a/op_builder/cpu/comm.py b/op_builder/cpu/comm.py index b26328341081..38e965530f43 100644 --- a/op_builder/cpu/comm.py +++ b/op_builder/cpu/comm.py @@ -19,7 +19,7 @@ def absolute_name(self): return f'deepspeed.ops.comm.{self.NAME}_op' def sources(self): - return ['csrc/cpu/comm/ccl.cpp'] + return ['csrc/cpu/comm/ccl.cpp', 'csrc/cpu/comm/shm.cpp'] def include_paths(self): includes = ['csrc/cpu/includes'] diff --git a/op_builder/fp_quantizer.py b/op_builder/fp_quantizer.py new file mode 100644 index 000000000000..bafd3e0c33f6 --- /dev/null +++ b/op_builder/fp_quantizer.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CUDAOpBuilder, installed_cuda_version + + +class FPQuantizerBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_FP_QUANTIZER" + NAME = "fp_quantizer" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.fp_quantizer.{self.NAME}_op' + + def is_compatible(self, verbose=True): + try: + import torch + except ImportError: + self.warning("Please install torch if trying to pre-compile inference kernels") + return False + + cuda_okay = True + if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda + sys_cuda_major, _ = installed_cuda_version() + torch_cuda_major = int(torch.version.cuda.split('.')[0]) + cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda + if cuda_capability < 8: + self.warning("NVIDIA Inference is only supported on Ampere and newer architectures") + cuda_okay = False + if cuda_capability >= 8: + if torch_cuda_major < 11 or sys_cuda_major < 11: + self.warning("On Ampere and higher architectures please use CUDA 11+") + cuda_okay = False + return super().is_compatible(verbose) and cuda_okay + + def filter_ccs(self, ccs): + ccs_retained = [] + ccs_pruned = [] + for cc in ccs: + if int(cc[0]) >= 8: + ccs_retained.append(cc) + else: + ccs_pruned.append(cc) + if len(ccs_pruned) > 0: + self.warning(f"Filtered compute capabilities {ccs_pruned}") + return ccs_retained + + def sources(self): + return [ + "csrc/fp_quantizer/quantize.cu", + "csrc/fp_quantizer/quantize.cpp", + ] + + def extra_ldflags(self): + return ['-lcurand'] + + def include_paths(self): + return ['csrc/fp_quantizer/includes', 'csrc/includes'] diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py index 459dcce6bfae..81b15f197f43 100644 --- a/op_builder/xpu/builder.py +++ b/op_builder/xpu/builder.py @@ -74,8 +74,9 @@ def fixed_aotflags(self): ] def load(self, verbose=True): - from deepspeed.git_version_info import installed_ops, torch_info # noqa: F401 - if installed_ops.get(self.name, False): + from deepspeed.git_version_info import installed_ops, torch_info, accelerator_name # noqa: F401 + from deepspeed.accelerator import get_accelerator + if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name: return importlib.import_module(self.absolute_name()) else: return self.jit_load(verbose) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index f28c1ecb165c..eb6bfc811e85 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -6,14 +6,15 @@ future importlib-metadata>=4 mup pre-commit>=2.20.0 -pytest<=8.0.0 +pytest>=7.2.0 pytest-forked pytest-randomly pytest-xdist +qtorch==0.3.0 recommonmark sphinx sphinx-rtd-theme tensorboard torchvision -transformers>=4.32.1 +transformers>=4.39.0 wandb diff --git a/setup.py b/setup.py index 25b741af9440..f1367b850e02 100755 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ 'Please visit https://pytorch.org/ to see how to properly install torch on your system.') from op_builder import get_default_compute_capabilities, OpBuilder -from op_builder.all_ops import ALL_OPS +from op_builder.all_ops import ALL_OPS, accelerator_name from op_builder.builder import installed_cuda_version # Fetch rocm state. 
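For context on the `accelerator_name` gate added to the op builders above (and recorded by setup.py below): a pre-compiled op is only reused when the accelerator it was built for matches the accelerator active at load time; otherwise the builder falls back to JIT compilation. A rough sketch of that check; the helper name below is hypothetical, while `installed_ops`, `accelerator_name`, and `get_accelerator()._name` come from the patch itself:

```
from deepspeed.accelerator import get_accelerator
from deepspeed.git_version_info import installed_ops, accelerator_name


def can_reuse_prebuilt_op(op_name: str) -> bool:
    # installed_ops maps op name -> bool recorded at install time;
    # accelerator_name is the accelerator the wheel was built against (e.g. 'cuda', 'xpu', 'cpu').
    return installed_ops.get(op_name, False) and accelerator_name == get_accelerator()._name
```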
@@ -168,12 +168,9 @@ def op_enabled(op_name): return int(get_env_if_set(env_var, BUILD_OP_DEFAULT)) -compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) install_ops = dict.fromkeys(ALL_OPS.keys(), False) for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() - compatible_ops[op_name] = op_compatible - compatible_ops["deepspeed_not_implemented"] = False # If op is requested but not available, throw an error. if op_enabled(op_name) and not op_compatible: @@ -280,11 +277,10 @@ def create_dir_symlink(src, dest): fd.write(f"git_hash='{git_hash}'\n") fd.write(f"git_branch='{git_branch}'\n") fd.write(f"installed_ops={install_ops}\n") - fd.write(f"compatible_ops={compatible_ops}\n") + fd.write(f"accelerator_name='{accelerator_name}'\n") fd.write(f"torch_info={torch_info}\n") print(f'install_requires={install_requires}') -print(f'compatible_ops={compatible_ops}') print(f'ext_modules={ext_modules}') # Parse README.md to make long_description for PyPI page. diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index cf533063d6ec..25256d376eeb 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -11,7 +11,7 @@ import deepspeed import deepspeed.comm as dist import deepspeed.runtime.utils as ds_utils -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec diff --git a/tests/unit/checkpoint/common.py b/tests/unit/checkpoint/common.py index 7442e51bad5d..3fb13b214ea0 100644 --- a/tests/unit/checkpoint/common.py +++ b/tests/unit/checkpoint/common.py @@ -14,6 +14,7 @@ from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from unit.common import preferred_dtype from unit.simple_model import * from unittest.mock import MagicMock, patch @@ -85,15 +86,20 @@ def compare_model_states(saved_model, loaded_model, compare_optimizer=True, load def compare_state_dicts(state0, state1, expected_mismatch_keys=[]): - for (k0, s0), (k1, s1) in zip(state0.items(), state1.items()): - assert k0 == k1, f'failure due to key mismatch {k0} != {k1}' - if k0 in expected_mismatch_keys: + key_set0 = set(k for k in state0.keys() if k not in expected_mismatch_keys) + key_set1 = set(k for k in state1.keys() if k not in expected_mismatch_keys) + assert key_set0 == key_set1, f'failure due to key mismatch {key_set0} != {key_set1}' + + for k in key_set0: + s0 = state0[k] + s1 = state1[k] + if k in expected_mismatch_keys: continue if isinstance(s0, torch.Tensor) and isinstance(s1, torch.Tensor): assert id(s0) != id(s1), f'Comparing optimizer state tensor against itself: {id(s0)} <====> {id(s1)}' assert torch.equal(s0.to('cpu'), s1.to('cpu')) else: - assert s0 == s1, f'failures with keys = {k0}, {k1}, values = {type(s0[0])} and {type(s1[0])}' + assert s0 == s1, f'failures with keys = {k}, {k}, values = {s0} and {s1}' def compare_opt_state_dicts(state0, state1, expected_mismatch_keys=[]): @@ -163,13 +169,15 @@ def checkpoint_correctness_verification(config_dict, tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False, - fp16=True, train_batch=False, base_optimizers=[None, None], empty_tag=False, seq_dataloader=False, - load_module_only=False): - dtype = torch.half if fp16 else torch.float32 + load_module_only=False, + dtype=None): + if dtype == None: + dtype = preferred_dtype() + ds_model = 
create_deepspeed_model(config_dict=config_dict, model=models[0], base_optimizer=base_optimizers[0]) if seq_dataloader: @@ -241,7 +249,7 @@ def checkpoint_correctness_verification(config_dict, load_module_only=load_module_only) if load_optimizer_states: - compare_optimizer_states(trained_model, loaded_model, hidden_dim, fp16) + compare_optimizer_states(trained_model, loaded_model, hidden_dim, dtype == torch.float16) if load_lr_scheduler_states: compare_lr_scheduler_states(trained_model, loaded_model) diff --git a/tests/unit/checkpoint/test_latest_checkpoint.py b/tests/unit/checkpoint/test_latest_checkpoint.py index 41ce2278680f..5d795c4dadcf 100644 --- a/tests/unit/checkpoint/test_latest_checkpoint.py +++ b/tests/unit/checkpoint/test_latest_checkpoint.py @@ -38,8 +38,8 @@ def test_existing_latest(self, tmpdir): tmpdir=tmpdir, load_optimizer_states=True, load_lr_scheduler_states=False, - fp16=False, - empty_tag=True) + empty_tag=True, + dtype=torch.float) def test_missing_latest(self, tmpdir): config_dict = { diff --git a/tests/unit/checkpoint/test_lr_scheduler.py b/tests/unit/checkpoint/test_lr_scheduler.py index c4c6773cd474..89c4dd1b49f7 100644 --- a/tests/unit/checkpoint/test_lr_scheduler.py +++ b/tests/unit/checkpoint/test_lr_scheduler.py @@ -5,6 +5,7 @@ import deepspeed from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest from unit.simple_model import * @@ -22,6 +23,8 @@ class TestLRSchedulerCheckpoint(DistributedTest): def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") + if get_accelerator().device_name() == 'cpu': + pytest.skip("CPU accelerator does not support this test.") config_dict = { "train_batch_size": 2, @@ -35,9 +38,6 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload @@ -51,6 +51,10 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -71,6 +75,8 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") + if get_accelerator().device_name() == 'cpu': + pytest.skip("CPU accelerator does not support this test.") config_dict = { "train_batch_size": 2, @@ -81,9 +87,6 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): "lr": 1e-5 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload @@ -97,6 +100,10 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): } }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: diff --git a/tests/unit/checkpoint/test_mics_optimizer.py b/tests/unit/checkpoint/test_mics_optimizer.py index 3f853cd5c13a..9e56bf3446fa
100644 --- a/tests/unit/checkpoint/test_mics_optimizer.py +++ b/tests/unit/checkpoint/test_mics_optimizer.py @@ -8,7 +8,7 @@ import deepspeed -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * from unit.checkpoint.common import * diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py index 0706b7327ce8..89878b5d8fa9 100644 --- a/tests/unit/checkpoint/test_moe_checkpoint.py +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -4,7 +4,7 @@ # DeepSpeed Team from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * @@ -33,10 +33,10 @@ def test_checkpoint_moe(self, tmpdir, ep_size): tmpdir=tmpdir, load_optimizer_states=True, load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], empty_tag=True, base_optimizers=optimizers, - seq_dataloader=True) + seq_dataloader=True, + dtype=torch.float16) @pytest.mark.parametrize("ep_size, load_optim_states", [(4, True), (4, False), (2, True), (2, False)]) def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): @@ -77,7 +77,7 @@ def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): tmpdir=tmpdir, load_optimizer_states=load_optim_states, load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], empty_tag=True, base_optimizers=optimizers, - seq_dataloader=True) + seq_dataloader=True, + dtype=torch.float16) diff --git a/tests/unit/checkpoint/test_other_optimizer.py b/tests/unit/checkpoint/test_other_optimizer.py index 9cb8c4286880..bcff7f5e3072 100644 --- a/tests/unit/checkpoint/test_other_optimizer.py +++ b/tests/unit/checkpoint/test_other_optimizer.py @@ -19,6 +19,8 @@ class TestOtherOptimizerCheckpoint(DistributedTest): @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") def test_checkpoint_unfused_optimizer(self, tmpdir): + #if not get_accelerator().is_fp16_supported(): + # pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -29,9 +31,6 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True - }, "scheduler": { "type": "OneCycle", "params": { @@ -49,6 +48,10 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 @@ -69,6 +72,8 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): load_optimizer_states=False) def test_checkpoint_fused_optimizer(self, tmpdir): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -81,10 +86,11 @@ def test_checkpoint_fused_optimizer(self, tmpdir): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 @@
-129,4 +135,4 @@ def test_checkpoint_fp32_optimizer(self, tmpdir): models=models, hidden_dim=hidden_dim, tmpdir=tmpdir, - fp16=False) + dtype=torch.float32) diff --git a/tests/unit/checkpoint/test_pipeline.py b/tests/unit/checkpoint/test_pipeline.py index 99f1ba2ec433..c6c228ccada7 100644 --- a/tests/unit/checkpoint/test_pipeline.py +++ b/tests/unit/checkpoint/test_pipeline.py @@ -58,10 +58,10 @@ def test_checkpoint_pipe_engine(self, zero_stage, tmpdir): models=models, hidden_dim=models[0].hidden_dim, tmpdir=tmpdir, - fp16=config_dict['fp16']['enabled'], load_optimizer_states=True, load_lr_scheduler_states=True, - train_batch=True) + train_batch=True, + dtype=torch.float16 if zero_stage > 0 else torch.float32) @pytest.mark.parametrize( "base_topo,test_topo", diff --git a/tests/unit/checkpoint/test_universal_checkpoint.py b/tests/unit/checkpoint/test_universal_checkpoint.py new file mode 100644 index 000000000000..e0c4f4745043 --- /dev/null +++ b/tests/unit/checkpoint/test_universal_checkpoint.py @@ -0,0 +1,215 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import deepspeed +from types import SimpleNamespace +from torch.utils._pytree import tree_map + +from deepspeed.utils.torch import required_torch_version +from deepspeed.checkpoint import UNIVERSAL_CHECKPOINT_INFO +from deepspeed.checkpoint.ds_to_universal import main as convert_to_universal + +from unit.common import DistributedTest, DistributedFixture +from unit.simple_model import * +from unit.util import bf16_required_version_check + +from unit.checkpoint.common import compare_opt_state_dicts, compare_state_dicts + +import pytest +import deepspeed.comm as dist + + +def get_expected_mismatch_keys(): + # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to + # false positive mismatches in checkpoint state comparisons. + # Newer torch versions store tensor ids as 0, 1, 2, ... 
+ return [] if required_torch_version(min_version=1.4) else ['params'] + + +def maybe_step(t): + return not torch.is_tensor(t) or (t.device.type == 'cpu' and t.numel() == 1) + + +def gather_opt_state(optimizer_state): + + def gather_tensor(t): + + if maybe_step(t): + return t + else: + buffer = [torch.zeros_like(t.flatten()) for _ in range(dist.get_world_size())] + dist.all_gather(buffer, t.flatten()) + return torch.cat(buffer) + + return tree_map(gather_tensor, optimizer_state) + + +def remove_pad_in_opt_state(optimizer_state, num_params): + + def remove_pad(t): + if maybe_step(t): + return t + else: + return t[:num_params] + + return tree_map(remove_pad, optimizer_state) + + +CP_TAG = "test_tag" + + +def init_ds_engine(model, ds_config, use_torch_adam): + + if use_torch_adam: + ds_optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + del ds_config["optimizer"] + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, optimizer=ds_optimizer) + else: + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) + + return model + + +def train_save_convert(ds_config, hidden_dim, load_optim, use_torch_adam, dtype, tmpdir): + if dtype == torch.bfloat16 and not bf16_required_version_check(): + return + + test_step = 8 + + model = SimpleModel(hidden_dim) + model = init_ds_engine(model, ds_config, use_torch_adam) + data_loader = random_dataloader(model=model, + total_samples=test_step, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + for batch in data_loader: + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + sd = model.optimizer.optimizer.state_dict() if load_optim else None + + client_state = {} + client_state[UNIVERSAL_CHECKPOINT_INFO] = {} + client_state['iteration'] = test_step + model.save_checkpoint(tmpdir, tag=CP_TAG, client_state=client_state) + + cp_dir = os.path.join(tmpdir, CP_TAG) + univ_cp_dir = f"{cp_dir}_universal" + + args = SimpleNamespace(input_folder=cp_dir, + output_folder=univ_cp_dir, + num_extract_workers=1, + num_merge_workers=1, + keep_temp_folder=False, + strict=True) + + dist.barrier() + if dist.get_rank() == 0: + convert_to_universal(args) + + model_state = model.state_dict() + optimizer_state = None + if load_optim: + optimizer_state = gather_opt_state(model.optimizer.optimizer.state_dict()) + + if dist.get_rank() == 0: + torch.save((model_state, optimizer_state), os.path.join(tmpdir, "baseline_state.pt")) + + dist.barrier() + + return model, sd + + +@pytest.fixture +def ds_config(zero_stage, dtype): + ds_config = { + "train_batch_size": 8, + "optimizer": { + "type": 'Adam' + }, + "zero_optimization": { + "stage": zero_stage, + } + } + if dtype == torch.float16: + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif dtype == torch.bfloat16: + ds_config["bf16"] = {"enabled": True} + return ds_config + + +class _baseline(DistributedFixture): + world_size = None + + def run(self, tmpdir, ds_config, zero_stage, dtype, load_optim, use_torch_adam): + hidden_dim = 10 + train_save_convert(ds_config, hidden_dim, load_optim, use_torch_adam, dtype, tmpdir) + + +class baseline_ws2(_baseline): + world_size = 2 + + +class baseline_ws4(_baseline): + world_size = 4 + + +@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16, torch.float32]) +@pytest.mark.parametrize("zero_stage", [1]) +@pytest.mark.parametrize("use_torch_adam", [False, True]) +@pytest.mark.parametrize("load_optim", [False, True]) +class TestZeROUniversalCheckpointDP(DistributedTest): 
+ + def _run_test(self, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + if dtype == torch.bfloat16 and not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + hidden_dim = 10 + loaded_model_state, loaded_optimizer_state = torch.load(f"{tmpdir}/baseline_state.pt") + + ds_config["checkpoint"] = {"load_universal": True} + univ_model = SimpleModel(hidden_dim) + univ_model = init_ds_engine(univ_model, ds_config, use_torch_adam) + univ_model.load_checkpoint(tmpdir, tag=f"{CP_TAG}_universal", load_optimizer_states=load_optim) + + model_state = univ_model.state_dict() + compare_state_dicts(model_state, loaded_model_state) + + if load_optim: + optimizer_state = gather_opt_state(univ_model.optimizer.optimizer.state_dict()) + # padding sizes may differ when dp sizes are different + param_count = sum(p.numel() for p in univ_model.parameters()) + optimizer_state = remove_pad_in_opt_state(optimizer_state, param_count) + loaded_optimizer_state = remove_pad_in_opt_state(loaded_optimizer_state, param_count) + + compare_opt_state_dicts(optimizer_state, loaded_optimizer_state, get_expected_mismatch_keys()) + + # Run training again to verify that the optimizer has necessary states + test_step = 8 + data_loader = random_dataloader(model=univ_model, + total_samples=test_step, + hidden_dim=hidden_dim, + device=univ_model.device, + dtype=dtype) + for batch in data_loader: + loss = univ_model(batch[0], batch[1]) + univ_model.backward(loss) + univ_model.step() + + @pytest.mark.world_size(2) + def test_dp_world_size_2to2(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam) + + @pytest.mark.world_size(2) + def test_dp_world_size_4to2(self, baseline_ws4, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam) + + @pytest.mark.world_size(4) + def test_dp_world_size_2to4(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 0b9efb3ec462..84b4eca6e2ca 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -8,7 +8,7 @@ from deepspeed.ops.op_builder import CPUAdamBuilder from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest, DistributedFixture from unit.simple_model import * @@ -28,15 +28,15 @@ def test_pipeline_checkpoint_loading(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, "pipeline_loading_checkpoint": True, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(): @@ -64,16 +64,16 @@ def test_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_op "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True, 
- "initial_scale_power": 8 - }, "wall_clock_breakdown": True, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -104,14 +104,15 @@ def test_not_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, ada "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -134,11 +135,11 @@ def test_hybrid_optimizer_state(self, tmpdir, zero_stage): "stage": zero_stage }, "zero_allow_untested_optimizer": True, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] optimizers = [HybridStateOptimizer(model.parameters()) for model in models] @@ -152,19 +153,21 @@ def test_hybrid_optimizer_state(self, tmpdir, zero_stage): @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) def test_load_module_only(self, tmpdir, zero_stage): + if zero_stage == 0 and get_accelerator().device_name() == "cpu": + pytest.skip("CPU Accelerator does not support this test") config_dict = { "train_batch_size": 2, "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -185,15 +188,15 @@ def run(self, class_tmpdir, elastic_save, load_optim): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 2, "elastic_checkpoint": elastic_save } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -221,15 +224,15 @@ def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, l "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 2, "elastic_checkpoint": elastic_save } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} hidden_dim = 10 # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to @@ -240,7 +243,11 @@ def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, l model, _, _, _ = deepspeed.initialize(config=ds_config, model=models[0], model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) + run_steps = 8 + data_loader = 
random_dataloader(model=model, + total_samples=run_steps, + hidden_dim=hidden_dim, + device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -274,15 +281,15 @@ def test_elastic_checkpoint_change_dp(self, ws4_model_checkpoint, class_tmpdir, "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 2, "elastic_checkpoint": elastic_load } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -305,14 +312,14 @@ def test_immediate_save_load(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -325,30 +332,27 @@ def test_immediate_save_load(self, tmpdir, zero_stage): @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) def test_load_immediate_save(self, tmpdir, zero_stage): + if zero_stage == 0 and get_accelerator().device_name() == "cpu": + pytest.skip("CPU Accelerator does not support this test") config_dict = { "train_batch_size": 4, "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) # 1. 
pretrain a model and save it - dtype = torch.half ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) - data_loader = random_dataloader(model=ds_model, - total_samples=1, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) + data_loader = random_dataloader(model=ds_model, total_samples=1, hidden_dim=hidden_dim, device=ds_model.device) for _, batch in enumerate(data_loader): loss = ds_model(batch[0], batch[1]) ds_model.backward(loss) @@ -371,10 +375,6 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, "stage3_gather_fp16_weights_on_model_save": True, @@ -383,6 +383,10 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): "train_micro_batch_size_per_gpu": 1, "train_batch_size": 4, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -391,11 +395,7 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): # So we config grad_accum=2 and step only once and save_16bit_model ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) - data_loader = random_dataloader(model=ds_model, - total_samples=2, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=torch.half) + data_loader = random_dataloader(model=ds_model, total_samples=2, hidden_dim=hidden_dim, device=ds_model.device) batch = next(iter(data_loader)) loss = ds_model(batch[0], batch[1]) @@ -429,15 +429,15 @@ def test_load_optimizer_state(self, tmpdir, zero_stage): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "wall_clock_breakdown": True, "zero_optimization": { "stage": zero_stage } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(enabled=zero_stage == 3): @@ -460,13 +460,14 @@ def test_not_load_optimizer_state(self, tmpdir, zero_stage): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(enabled=zero_stage == 3): @@ -481,14 +482,14 @@ def test_load_module_only(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(enabled=zero_stage == 3): @@ -504,14 +505,14 @@ def test_save_exclude_frozen_weights(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif 
get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleFrozenModel(hidden_dim, empty_grad=False) @@ -552,14 +553,14 @@ def test_save_exclude_custom_frozen_weights(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleFrozenModel(hidden_dim, empty_grad=False) diff --git a/tests/unit/common.py b/tests/unit/common.py index 76bebf6b725a..1fd83de81f02 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -23,7 +23,7 @@ from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker # Worker timeout for tests that hang -DEEPSPEED_TEST_TIMEOUT = 600 +DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600')) def is_rocm_pytorch(): @@ -81,6 +81,11 @@ def set_accelerator_visible(): match = re.search('Device Type.*GPU', line) if match: num_accelerators += 1 + elif get_accelerator().device_name() == 'hpu': + hl_smi = subprocess.check_output(['hl-smi', "-L"]) + num_accelerators = re.findall(r"Module ID\s+:\s+(\d+)", hl_smi.decode()) + num_accelerators = sorted(num_accelerators, key=int) + os.environ["HABANA_VISIBLE_MODULES"] = ",".join(num_accelerators) elif get_accelerator().device_name() == 'npu': npu_smi = subprocess.check_output(['npu-smi', 'info', '-l']) num_accelerators = int(npu_smi.decode('utf-8').strip().split('\n')[0].split(':')[1].strip()) @@ -90,7 +95,10 @@ def set_accelerator_visible(): subprocess.check_output('cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l', shell=True)) num_accelerators = cpu_sockets - cuda_visible = ",".join(map(str, range(num_accelerators))) + if isinstance(num_accelerators, list): + cuda_visible = ",".join(num_accelerators) + else: + cuda_visible = ",".join(map(str, range(num_accelerators))) # rotate list based on xdist worker id, example below # wid=0 -> ['0', '1', '2', '3'] @@ -149,6 +157,12 @@ def _get_fixture_kwargs(self, request, func): def _launch_daemonic_procs(self, num_procs): # Create process pool or use cached one master_port = None + + if get_accelerator().device_name() == 'hpu': + if self.reuse_dist_env: + print("Ignoring reuse_dist_env for hpu") + self.reuse_dist_env = False + if self.reuse_dist_env: if num_procs not in self._pool_cache: self._pool_cache[num_procs] = mp.Pool(processes=num_procs) @@ -169,9 +183,10 @@ def _launch_daemonic_procs(self, num_procs): # usually means an environment error and the rest of tests will # hang (causing super long unit test runtimes) pytest.exit("Test hanged, exiting", returncode=1) - - # Tear down distributed environment and close process pools - self._close_pool(pool, num_procs) + finally: + # Regardless of the outcome, ensure proper teardown + # Tear down distributed environment and close process pools + self._close_pool(pool, num_procs) # If we skipped a test, propagate that to this process if any(skip_msgs): @@ -441,3 +456,13 @@ def _get_current_test_func(self, request): def get_test_path(filename): curr_path = Path(__file__).parent return str(curr_path.joinpath(filename)) + + +# fp16 > bf16 > fp32 +def preferred_dtype(): + if get_accelerator().is_fp16_supported(): + return torch.float16 + elif get_accelerator().is_bf16_supported(): + return torch.bfloat16 + else: + return 
torch.float32 diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index c6e5031349cb..1802c09f33b5 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -14,7 +14,7 @@ from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress from deepspeed.compression.helper import convert_conv1d_to_linear from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5), diff --git a/tests/unit/compression/test_dequantization.py b/tests/unit/compression/test_dequantization.py index 692f4cef97d7..8446904754b3 100644 --- a/tests/unit/compression/test_dequantization.py +++ b/tests/unit/compression/test_dequantization.py @@ -7,8 +7,9 @@ import os import torch +import pytest from unit.common import DistributedTest -from deepspeed.ops.op_builder import InferenceBuilder +import deepspeed from deepspeed.accelerator import get_accelerator @@ -18,7 +19,11 @@ def init(self): local_rank = int(os.getenv("LOCAL_RANK", "0")) self.device = torch.device(get_accelerator().device_name(local_rank)) - self.dequantize_func = InferenceBuilder().load().dequantize_fp16 + from deepspeed.ops.op_builder import InferenceBuilder + if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("InferenceBuilder is not implemented") + else: + self.dequantize_func = InferenceBuilder().load().dequantize_fp16 def run_dequantize_test(self, M, N, num_groups): weight = torch.randint(-255, 255, (M, N)).to(dtype=torch.int8, device=self.device) diff --git a/tests/unit/elasticity/test_elastic.py b/tests/unit/elasticity/test_elastic.py index a49ec595a420..63633a51914b 100644 --- a/tests/unit/elasticity/test_elastic.py +++ b/tests/unit/elasticity/test_elastic.py @@ -9,7 +9,7 @@ from deepspeed.git_version_info import version as ds_version import os from unit.simple_model import SimpleModel -from deepspeed.ops.op_builder import FusedAdamBuilder +from deepspeed.ops.op_builder import FusedAdamBuilder, FusedLambBuilder if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -183,6 +183,8 @@ class TestNonElasticBatchParamsWithOverride(DistributedTest): world_size = 2 def test(self): + if not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME]: + pytest.skip("This op had not been implemented on this system.", allow_module_level=True) config_dict = { "train_batch_size": 2, "steps_per_print": 1, diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py index fd6a8e5ad2e1..77b51fcd5814 100644 --- a/tests/unit/inference/quantization/test_intX_quantization.py +++ b/tests/unit/inference/quantization/test_intX_quantization.py @@ -11,7 +11,7 @@ from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization from deepspeed.inference.quantization.utils import Quantizer, DeQuantizer from deepspeed.inference.quantization.layers import QuantizedLinear -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from transformers.models.opt.modeling_opt import OPTDecoderLayer from transformers import 
AutoConfig, OPTConfig, AutoModel import pytest diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index f3056a225a9b..4e203a71db60 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -653,8 +653,15 @@ def no_pool_bootstrap_stderr(f, xs, iters): setattr(lm, model_family, getattr(lm, model_family).half().to(device)) lm._device = device else: - lm = lm_eval.models.get_model(model_family).create_from_arg_string( - f"pretrained={model_name}", {"device": get_accelerator().device_name()}) + if get_accelerator().device_name() == 'hpu': + #lm_eval not supporting HPU device, so get model with CPU and move it to HPU. + lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}", + {"device": "cpu"}) + setattr(lm, model_family, getattr(lm, model_family).to(device)) + lm._device = device + else: + lm = lm_eval.models.get_model(model_family).create_from_arg_string( + f"pretrained={model_name}", {"device": get_accelerator().device_name()}) get_accelerator().synchronize() start = time.time() diff --git a/tests/unit/launcher/test_user_args.py b/tests/unit/launcher/test_user_args.py index 99afd0f2cfa7..b86be4dfe74c 100644 --- a/tests/unit/launcher/test_user_args.py +++ b/tests/unit/launcher/test_user_args.py @@ -43,7 +43,9 @@ def cmd(user_script_fp, prompt, multi_node): '''I'm going to tell them "DeepSpeed is the best"''' ]) @pytest.mark.parametrize("multi_node", [True, False]) -def test_user_args(cmd): +def test_user_args(cmd, multi_node): + if multi_node and get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() assert "ARG PARSE SUCCESS" in out.decode("utf-8"), f"User args not parsed correctly: {err.decode('utf-8')}" diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index 824ecea5f144..cca1ef3584ad 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -13,7 +13,7 @@ from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, DistributedFixture from unit.megatron_model import get_gpt2_model, get_megatron_version -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index b500b9d857a5..e50fd18577b1 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -15,7 +15,7 @@ from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py 
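
A pattern repeated throughout the hunks above is the relocation of required_torch_version from deepspeed.runtime.utils to deepspeed.utils.torch. External code that has to run against both layouts could guard the import — a small compatibility sketch, assuming only that a DeepSpeed release exposing one of the two paths is installed:

try:
    from deepspeed.utils.torch import required_torch_version    # newer layout
except ImportError:
    from deepspeed.runtime.utils import required_torch_version  # older layout

# The call signature is unchanged by the move, e.g. the gate used in several
# of the tests above:
if not required_torch_version(min_version=1.8):
    raise RuntimeError("this sketch assumes torch >= 1.8")
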
index 310a0df16381..d39f9fe3d651 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -9,8 +9,47 @@ import gc from unit.common import DistributedTest from unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader +import deepspeed.comm as dist +from deepspeed import get_accelerator +from deepspeed.moe.sharded_moe import top1gating from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer, is_moe_param -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version + + +@pytest.mark.parametrize("zero_stage", [0, 1, 2]) +class TestSimpleMoE(DistributedTest): + world_size = 2 + + def test(self, zero_stage): + if not required_torch_version(min_version=1.8): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } + } + # should automatically create moe param groups in deepspeed backend + hidden_dim = 16 + model = SimpleMoEModel(hidden_dim=hidden_dim, ep_size=1) + model, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model) + data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() @pytest.mark.parametrize("ep_size", [2, 4]) @@ -132,3 +171,23 @@ def test(self, ep_size, use_residual): loss = model(batch[0], batch[1]) model.backward(loss) model.step() + + +class TestTopk(DistributedTest): + world_size = 2 + + def test(self): + device = get_accelerator().current_device() + if dist.get_rank() == 0: + logits = torch.rand(2, 2, device=device) + elif dist.get_rank() == 1: + logits = torch.rand(10, 2, device=device) + + output = top1gating(logits=logits, + capacity_factor=1, + min_capacity=0, + used_token=None, + noisy_gate_policy=None, + drop_tokens=False, + use_rts=True, + use_tutel=False) diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py index 0069c674690c..eb4668015c01 100644 --- a/tests/unit/moe/test_moe_tp.py +++ b/tests/unit/moe/test_moe_tp.py @@ -7,7 +7,7 @@ import deepspeed import pytest from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.moe.layer import MoE diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py index e84215fb4e95..d7a5f9a46b97 100644 --- a/tests/unit/multi_output_model.py +++ b/tests/unit/multi_output_model.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +from .common import preferred_dtype class MultiOutputModel(torch.nn.Module): @@ -28,8 +29,11 @@ def multi_output_dataloader(model, total_samples, hidden_dim, device, inputs, ta batch_size = model.train_micro_batch_size_per_gpu() train_data = [ - torch.full(size=(total_samples, hidden_dim), fill_value=x, device=device, dtype=torch.half, requires_grad=True) - for x in inputs + torch.full(size=(total_samples, hidden_dim), + fill_value=x, + device=device, + dtype=preferred_dtype(), + requires_grad=True) for x in inputs ] train_label = [torch.empty(total_samples, device=device, dtype=torch.long).fill_(y) for y in targets] diff --git a/tests/unit/ops/accelerators/test_accelerator_backward.py 
b/tests/unit/ops/accelerators/test_accelerator_backward.py index 43f7b471e2ae..48e5fbbe7475 100644 --- a/tests/unit/ops/accelerators/test_accelerator_backward.py +++ b/tests/unit/ops/accelerators/test_accelerator_backward.py @@ -16,10 +16,6 @@ from unit.modelingpreln import BertEncoder as BertEncoderPreln from unit.common import DistributedTest, is_rocm_pytorch -#if not deepspeed.ops.__installed_ops__['transformer']: -#pytest.skip( -# "transformer kernels are temporarily disabled because of unexplained failures", -# allow_module_level=True) if torch.half not in get_accelerator().supported_dtypes(): pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) diff --git a/tests/unit/ops/fp_quantizer/test_fp_quant.py b/tests/unit/ops/fp_quantizer/test_fp_quant.py new file mode 100644 index 000000000000..101f4cd69811 --- /dev/null +++ b/tests/unit/ops/fp_quantizer/test_fp_quant.py @@ -0,0 +1,94 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed + +from deepspeed.ops.fp_quantizer import FP_Quantize +from deepspeed.ops.op_builder import FPQuantizerBuilder + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + +# warning: this import silently JIT builds a set of kernels and may take a minute +from qtorch.quant import float_quantize + + +def qtorch_quantize(input, exp_bits=4, man_bits=3, rounding="nearest", group_size=1024): + ori_dt = input.dtype + ori_shape = input.shape + last_dim = group_size + input = input.view(-1, last_dim) + + q_bits = exp_bits + man_bits + 1 + input_to_float = input.float() + if q_bits == 8: + q_range = 480. + elif q_bits == 6: + q_range = 28. + elif q_bits == 12: + q_range = 510. + else: + assert (0), \ + "Please specify the right quantization range for the selected precision!" 
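
    # NOTE (editorial assumption, not part of the original test): the q_range
    # constants above appear to be the largest finite magnitude of the target
    # float layout, i.e. (2 - 2**-man_bits) * 2**(2**(exp_bits - 1)):
    #   q_bits=8  -> E4M3: (2 - 1/8)   * 2**8 = 480
    #   q_bits=6  -> E3M2: (2 - 1/4)   * 2**4 = 28
    #   q_bits=12 -> E4M7: (2 - 1/128) * 2**8 = 510
    # Quick standalone check:
    #   >>> (2 - 2**-3) * 2**8, (2 - 2**-2) * 2**4, (2 - 2**-7) * 2**8
    #   (480.0, 28.0, 510.0)
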
+ input_max = input_to_float.abs().amax(dim=-1, keepdim=True) + return ((float_quantize(input_to_float / input_max * q_range, exp_bits, man_bits, rounding=rounding) * \ + input_max / q_range).to(ori_dt)).reshape(ori_shape) + + +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +def test_fp_quant_meta(dtype): + group_size = 128 + q_bits = 8 + exp_bits = 4 + man_bits = 3 + + fpq = FP_Quantize(group_size=group_size) + for i in range(10): + x = torch.rand(4, 1024, dtype=dtype, device='cuda') + + ds_x = x.clone() + x_quantized, meta_tensor = fpq.quantize(ds_x, q_bits=q_bits, return_meta_tensor=True) + x_dequantized = fpq.dequantize(x_quantized, q_bits=q_bits, scale=meta_tensor) + + qtorch_out = qtorch_quantize(x, exp_bits=exp_bits, man_bits=man_bits, group_size=group_size) + qtorch_error = (qtorch_out - x).abs().sum() / x.numel() + ds_error = (x_dequantized - x).abs().sum() / x.numel() + + assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" + + +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +@pytest.mark.parametrize("q_bits", [8, 6, 12], ids=["qbits8", "qbits6", "qbits12"]) +def test_fp_quant(dtype, q_bits): + group_size = 128 + fpq = FP_Quantize(group_size=group_size) + + for i in range(10): + x = torch.rand(4, 1024, dtype=dtype, device='cuda') + + ds_x = x.clone() + x_quantized = fpq.quantize(ds_x, q_bits=q_bits) + x_dequantized = fpq.dequantize(x_quantized, q_bits=q_bits) + + if q_bits == 8: + exp_bits = 4 + man_bits = 3 + elif q_bits == 6: + exp_bits = 3 + man_bits = 2 + elif q_bits == 12: + exp_bits = 4 + man_bits = 7 + else: + raise ValueError(f"unknown {q_bits=}") + + qtorch_out = qtorch_quantize(x, exp_bits=exp_bits, man_bits=man_bits, group_size=group_size) + + qtorch_error = (qtorch_out - x).abs().sum() / x.numel() + ds_error = (x_dequantized - x).abs().sum() / x.numel() + + assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index bbcb01b489f4..c72deecf287f 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -9,7 +9,7 @@ from deepspeed.profiling.flops_profiler import get_model_profile from unit.simple_model import SimpleModel, random_dataloader from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator if torch.half not in get_accelerator().supported_dtypes(): diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index 0232457a4f9c..22a61003b31e 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -62,6 +62,8 @@ def _match_outputs(ref, tgt): def _test_activation_checkpoint(module, *inputs): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") # Move to device module.to(get_accelerator().device_name()) @@ -82,6 +84,8 @@ def _test_activation_checkpoint(module, *inputs): def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does 
not support this test yet") # Move to device module.to(get_accelerator().device_name()) diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index d9ac79619bd3..17b2ffbb9d29 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -7,9 +7,11 @@ """ import torch +import deepspeed import deepspeed.comm as dist from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce from deepspeed.accelerator import get_accelerator +import pytest from unit.common import DistributedTest @@ -68,6 +70,9 @@ class TestAllToAllQuantReduceFallback(DistributedTest): def test_1d_tensor(self): # case 1: 1D tensor input = torch.zeros((10, ), dtype=torch.half, device=get_accelerator().current_device_name()) + from deepspeed.ops.op_builder import QuantizerBuilder + if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]: + pytest.skip("QuantizerBuilder is not implemented") output = all_to_all_quant_reduce([input], {})[0] if dist.get_rank() == 0: @@ -80,6 +85,9 @@ def test_1d_tensor(self): def test_non_divisible(self): # case 2: tensor size not divisible by global_world_size input = torch.zeros((7, 7), dtype=torch.half, device=get_accelerator().current_device_name()) + from deepspeed.ops.op_builder import QuantizerBuilder + if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]: + pytest.skip("QuantizerBuilder is not implemented") output = all_to_all_quant_reduce([input], {})[0] if dist.get_rank() == 0: diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py index 98a7c28c6a28..d1830534f6ea 100644 --- a/tests/unit/runtime/compile/test_compile_wrapper.py +++ b/tests/unit/runtime/compile/test_compile_wrapper.py @@ -8,7 +8,7 @@ import deepspeed from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest @@ -34,6 +34,8 @@ def base_config(): "backend": "inductor" } } + if get_accelerator().device_name() == 'hpu': + config_dict['compile']['backend'] = 'hpu_backend' return config_dict @@ -72,6 +74,8 @@ def _run_model(self, engine): @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") def test_custom_function(self, base_config): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") test_value = 10 engine = self._init_engine(base_config, test_value) diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py index 910f32db1c96..7568c27e3ed2 100644 --- a/tests/unit/runtime/compile/test_compile_zero.py +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -7,7 +7,8 @@ import torch from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version +from deepspeed.accelerator import get_accelerator from unit.runtime.compile.util import compare_loss from unit.common import DistributedTest @@ -29,6 +30,8 @@ def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device): pytest.skip( " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" ) + if get_accelerator().device_name() == 
"cpu": + pytest.skip("CPU does not support this test yet") if offload_device == OffloadDeviceEnum.nvme: if zero_stage != 3: @@ -52,6 +55,8 @@ def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device): } } + if get_accelerator().device_name() == 'hpu': + config_dict['compile']['backend'] = 'hpu_backend' if offload_device == OffloadDeviceEnum.cpu: config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} elif offload_device == OffloadDeviceEnum.nvme: diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py index 5f1c01b86852..601adae58884 100644 --- a/tests/unit/runtime/compile/test_load_config.py +++ b/tests/unit/runtime/compile/test_load_config.py @@ -9,7 +9,7 @@ from unit.simple_model import SimpleModel import deepspeed from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest @@ -50,6 +50,9 @@ def base_config(): "backend": "inductor" } } + + if get_accelerator().device_name() == 'hpu': + config_dict['compile']['backend'] = 'hpu_backend' return config_dict @@ -74,12 +77,16 @@ def _run_model(self, engine): @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") def test_compile(self, base_config): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") engine = self._init_engine(base_config) self._run_model(engine) assert engine.is_compiled @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") def test_custom_backend(self, base_config): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") global custom_backend_called custom_backend_called = False @@ -89,12 +96,16 @@ def test_custom_backend(self, base_config): assert custom_backend_called def test_compile_disabled(self, base_config): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") base_config["compile"]["enabled"] = False engine = self._init_engine(base_config) self._run_model(engine) @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") def test_compile_kwargs(self, base_config): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") base_config["compile"]["kwargs"] = {"mode": "default"} engine = self._init_engine(base_config) self._run_model(engine) @@ -102,6 +113,8 @@ def test_compile_kwargs(self, base_config): @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") def test_set_compile_kwargs(self, base_config): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") engine = self._init_engine(base_config) engine.set_torch_compile_kwargs({"mode": "default"}) self._run_model(engine) @@ -109,6 +122,8 @@ def test_set_compile_kwargs(self, base_config): @pytest.mark.skipif(not deepspeed.is_compile_supported(), reason="torch.compile is not supported") def test_set_compiler_fn(self, base_config): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") global custom_compler_fn_called custom_compler_fn_called = False diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py 
b/tests/unit/runtime/half_precision/onebit/test_onebit.py index ba795a853be0..1be2d73ef4c0 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -17,7 +17,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from unit.alexnet_model import AlexNetPipe, train_cifar -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator PipeTopo = PipeDataParallelTopology @@ -33,12 +33,18 @@ pytest.skip("NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5", allow_module_level=True) +if get_accelerator().device_name() == 'hpu': + pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) class TestOneBitAdamBasic(DistributedTest): world_size = 2 def test(self, dtype): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -80,6 +86,8 @@ class TestOneBitAdamExpAvgMask(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -144,6 +152,8 @@ class TestOneBitAdamCheckpointing(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -293,6 +303,8 @@ def test(self, tmpdir): assert optimizer_3.optimizer.adam_freeze_key is False def test_overflow(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -343,6 +355,8 @@ class TestOneBitAdamFP16Pipeline(DistributedTest): world_size = 4 def test(self, topo_config): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 4, "grandient_accumulation_steps": 1, @@ -388,6 +402,8 @@ class TestZeroOneAdamBasic(DistributedTest): world_size = 2 def test(self, dtype): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -432,6 +448,8 @@ class TestZeroOneAdamExpAvgMask(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -499,6 +517,8 @@ class TestZeroOneAdamCheckpointing(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -647,6 +667,8 @@ def test(self, tmpdir): assert "server_error" not in v, f"Incorrect server error" def test_overflow(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -700,6 +722,8 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest): world_size = 4 def test(self, topo_config): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 4, 
"grandient_accumulation_steps": 1, @@ -748,6 +772,8 @@ class TestOneBitLambBasic(DistributedTest): world_size = 2 def test(self, dtype): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -795,6 +821,8 @@ class TestOneBitLampExpAvgMask(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -864,6 +892,8 @@ class TestOneBitLambCheckpointing(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -1030,6 +1060,8 @@ def test(self, tmpdir): assert optimizer_3.optimizer.lamb_freeze_key is False def test_overflow(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -1086,6 +1118,8 @@ class TestOneBitLambFP16Pipeline(DistributedTest): world_size = 4 def test(self, topo_config): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 4, "grandient_accumulation_steps": 1, @@ -1131,6 +1165,8 @@ class TestCompressedAllReduceBasic(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") from deepspeed.runtime.comm.nccl import NcclBackend size = dist.get_world_size() diff --git a/tests/unit/runtime/half_precision/test_bf16.py b/tests/unit/runtime/half_precision/test_bf16.py index 3f551fb0fd4a..0af14abc3be5 100644 --- a/tests/unit/runtime/half_precision/test_bf16.py +++ b/tests/unit/runtime/half_precision/test_bf16.py @@ -12,6 +12,7 @@ from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader from unit.util import bf16_required_version_check from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator class TestAdamBF16ZeroOneCycleCompatibility(DistributedTest): @@ -287,8 +288,8 @@ def test(self, stage=2): model.step() -@pytest.mark.parametrize("comp_type", [torch.float16, torch.bfloat16, torch.float], ids=["fp16", "bfp16", "fp32"]) -@pytest.mark.parametrize("comm_type", [torch.float16, torch.bfloat16, None], ids=["fp16", "bfp16", "default"]) +@pytest.mark.parametrize("comp_type", [torch.float16, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) +@pytest.mark.parametrize("comm_type", [torch.float16, torch.bfloat16, None], ids=["fp16", "bf16", "default"]) class TestZeroDtypeCocktail(DistributedTest): world_size = 2 @@ -299,7 +300,11 @@ def test(self, comp_type, comm_type): " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" ) - type_str = {torch.float16: "fp16", torch.bfloat16: "bfp16"} + if comp_type == torch.float16 or comm_type == torch.float16: + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + + type_str = {torch.float16: "fp16", torch.bfloat16: "bf16"} config_dict = { "train_micro_batch_size_per_gpu": 2, diff --git a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py index 2a58fd6b4a57..f350e08e68a7 100644 --- a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py +++ 
b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py @@ -5,6 +5,8 @@ import torch import deepspeed +from deepspeed.accelerator import get_accelerator +import pytest import numpy as np from unit.common import DistributedTest from unit.simple_model import SimpleModel @@ -22,6 +24,9 @@ class TestFused(DistributedTest): world_size = 1 def test_no_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -57,6 +62,8 @@ def test_no_overflow(self): expected_loss_scale *= 2 def test_all_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -90,6 +97,8 @@ def test_all_overflow(self): assert optim.cur_iter == (i + 1) def test_some_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -147,6 +156,8 @@ class TestUnfused(DistributedTest): world_size = 1 def test_no_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -181,6 +192,8 @@ def test_no_overflow(self): expected_loss_scale *= 2 def test_all_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -217,6 +230,8 @@ def test_all_overflow(self): assert optim.cur_iter == (i + 1) def test_some_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index 3d5e18b46502..5b300053d2a8 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -10,9 +10,10 @@ from deepspeed.ops.adam import FusedAdam from unit.common import DistributedTest from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer try: from apex import amp # noqa: F401 # type: ignore @@ -26,6 +27,8 @@ class TestLambFP32GradClip(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -56,6 +59,8 @@ class TestLambFP16(DistributedTest): world_size = 2 def test__basic(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -81,6 +86,8 @@ def test__basic(self): model.step() def test_empty_grad(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -143,6 +150,8 @@ class TestAdamwFP16Basic(DistributedTest): world_size = 1 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = {"train_batch_size": 1, 
"steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 @@ -160,6 +169,8 @@ class TestFP16OptimizerForMoE(DistributedTest): world_size = 2 def test_unfused_gradnorm(self, monkeypatch): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if not required_torch_version(min_version=1.8): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") @@ -188,6 +199,8 @@ def mock_unscale_and_clip_grads(total_norm, apply_scale=True): engine.step() def test_fused_gradnorm(self, monkeypatch): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if not required_torch_version(min_version=1.8): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") @@ -203,8 +216,10 @@ def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True) # initialize MoE model = SimpleMoEModel(hidden_dim, ep_size=2) + param_group = {'params': [p for p in model.parameters()], 'name': 'random-unique-name'} + params = split_params_into_different_moe_groups_for_optimizer(param_group) # optimizer = torch.optim.AdamW(params=model.parameters()) - optimizer = FusedAdam(params=model.parameters()) + optimizer = FusedAdam(params=params) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, @@ -218,6 +233,8 @@ def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True) @pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)]) def test_lamb_gradnorm(self, monkeypatch, fused_lamb_legacy: bool): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if not required_torch_version(min_version=1.8): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") @@ -262,6 +279,8 @@ class TestAdamwFP16EmptyGrad(DistributedTest): world_size = 1 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 @@ -281,6 +300,8 @@ class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest): world_size = 1 def test(self, zero_stage, use_cpu_offload): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -332,6 +353,8 @@ class TestZeroStaticScale(DistributedTest): world_size = 1 def test(self, zero_stage, use_cpu_offload, hidden_dim=4): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -375,6 +398,8 @@ class TestZeroAllowUntestedOptimizer(DistributedTest): world_size = 1 def test(self, zero_stage, use_cpu_offload): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -408,6 +433,8 @@ class TestZeroEmptyPartition(DistributedTest): world_size = 3 def test(self, zero_stage, use_cpu_offload): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -454,6 +481,8 @@ class TestAmp(DistributedTest): world_size = 2 def test_adam_basic(self): 
+ if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = {"train_batch_size": 2, "steps_per_print": 1, "amp": {"enabled": True}} hidden_dim = 10 @@ -467,6 +496,8 @@ def test_adam_basic(self): model.step() def test_lamb_basic(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -492,6 +523,8 @@ def test_lamb_basic(self): model.step() def test_adam_O2(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -518,6 +551,8 @@ def test_adam_O2(self): model.step() def test_adam_O2_empty_grad(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -550,6 +585,8 @@ class TestZeroSupportedClientOptimizer(DistributedTest): world_size = 1 def test(self, zero_stage, optimizer_constructor): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -571,6 +608,8 @@ class TestZero2ReduceScatterOff(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -610,6 +649,8 @@ class TestFP16AdamTypes(DistributedTest): world_size = 1 def test(self, adam_type, torch_impl): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -642,6 +683,8 @@ class TestZero3LazyScatter(DistributedTest): world_size = 1 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -677,6 +720,8 @@ class TestZeroEmptyGrad(DistributedTest): world_size = 1 def test(self, stage): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py index 92da2257bdb0..badd0bcee549 100644 --- a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py +++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py @@ -4,9 +4,14 @@ # DeepSpeed Team import torch +import pytest import deepspeed from unit.common import DistributedTest from unit.util import skip_on_arch +from deepspeed.accelerator import get_accelerator + +if get_accelerator().device_name() == 'hpu': + pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True) class Model(torch.nn.Module): diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py index 0689adc08670..6338a16b8dbb 100644 --- a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py +++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py @@ -4,11 +4,15 @@ # DeepSpeed Team import torch +import pytest import deepspeed from unit.common import DistributedTest - +from deepspeed.accelerator import get_accelerator import deepspeed.utils.groups as groups +if get_accelerator().device_name() == 'hpu': + pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True) + class Model(torch.nn.Module): diff 
--git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index b9bd9c3aa56e..87fb49aad830 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -7,6 +7,7 @@ import os import deepspeed from deepspeed.accelerator import get_accelerator +import pytest from unit.common import DistributedTest from unit.simple_model import Curriculum_SimpleModel, SimpleModel, random_dataloader, random_dataset @@ -53,6 +54,8 @@ class TestDataEfficiency(DistributedTest): world_size = 2 def test_curriculum_learning(self): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -64,11 +67,6 @@ def test_curriculum_learning(self): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, "data_efficiency": { "enabled": True, "seed": 1234, @@ -98,6 +96,10 @@ def test_curriculum_learning(self): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "loss_scale": 0, "initial_scale_power": 16} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} def data_post_process(data, data_sampler_state_dict): assert 'dummy_metric' in data_sampler_state_dict['current_difficulties'] @@ -105,7 +107,7 @@ def data_post_process(data, data_sampler_state_dict): hidden_dim = 10 model = SimpleModel(hidden_dim) - dataset = random_dataset(20, hidden_dim, torch.device('cpu'), dtype=torch.half) + dataset = random_dataset(20, hidden_dim, torch.device('cpu')) model, _, data_loader, _ = deepspeed.initialize(config=config_dict, model=model, training_data=dataset, @@ -128,6 +130,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): world_size = 2 def test_fixed_discrete(self): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -139,11 +143,6 @@ def test_fixed_discrete(self): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, "curriculum_learning": { "enabled": True, "curriculum_type": "seqlen", @@ -156,6 +155,10 @@ def test_fixed_discrete(self): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "loss_scale": 0, "initial_scale_power": 16} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} @@ -172,6 +175,8 @@ def test_fixed_discrete(self): assert seqlen == true_seqlen, f"Incorrect curriculum schedule" def test_fixed_linear(self): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -183,11 +188,6 @@ def test_fixed_linear(self): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, "curriculum_learning": { "enabled": True, "curriculum_type": "seqlen", @@ -200,6 +200,10 @@ def test_fixed_linear(self): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "loss_scale": 0, "initial_scale_power": 16} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 
10, 9: 10, 10: 10} diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 880282bb7e57..c11c63d04867 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -47,9 +47,6 @@ def base_config(): "lr": 0.00015 } }, - "fp16": { - "enabled": True - } } return config_dict @@ -163,11 +160,19 @@ class TestConfigLoad(DistributedTest): world_size = 1 def test_dict(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(config=base_config, model=model, model_parameters=model.parameters()) def test_json(self, base_config, tmpdir): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} config_path = os.path.join(tmpdir, "config.json") with open(config_path, 'w') as fp: json.dump(base_config, fp) @@ -176,6 +181,10 @@ def test_json(self, base_config, tmpdir): model, _, _, _ = deepspeed.initialize(config=config_path, model=model, model_parameters=model.parameters()) def test_hjson(self, base_config, tmpdir): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} config_path = os.path.join(tmpdir, "config.json") with open(config_path, 'w') as fp: hjson.dump(base_config, fp) @@ -188,6 +197,10 @@ class TestDeprecatedDeepScaleConfig(DistributedTest): world_size = 1 def test(self, base_config, tmpdir): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} config_path = create_config_from_dict(tmpdir, base_config) parser = argparse.ArgumentParser() args = parser.parse_args(args='') @@ -209,6 +222,10 @@ class TestDistInit(DistributedTest): world_size = 1 def test(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -227,6 +244,12 @@ class TestInitNoOptimizer(DistributedTest): world_size = 1 def test(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} + if get_accelerator().device_name() == "cpu": + pytest.skip("This test timeout with CPU accelerator") del base_config["optimizer"] hidden_dim = 10 @@ -246,6 +269,10 @@ class TestArgs(DistributedTest): world_size = 1 def test_none_args(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(args=None, model=model, config=base_config) data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) @@ -253,6 +280,10 @@ def test_none_args(self, base_config): loss = model(batch[0], batch[1]) def test_no_args(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif 
get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(model=model, config=base_config) data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) @@ -264,6 +295,10 @@ class TestNoModel(DistributedTest): world_size = 1 def test(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} model = SimpleModel(hidden_dim=10) with pytest.raises(AssertionError): model, _, _, _ = deepspeed.initialize(model=None, config=base_config) diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py index 8ec9f05a0a17..169096a6d4e5 100644 --- a/tests/unit/runtime/test_ds_initialize.py +++ b/tests/unit/runtime/test_ds_initialize.py @@ -17,7 +17,9 @@ from deepspeed.ops.adam import FusedAdam from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR from deepspeed.runtime.config import ADAM_OPTIMIZER -from deepspeed.runtime.utils import see_memory_usage, required_torch_version +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.utils.torch import required_torch_version +from deepspeed.accelerator import get_accelerator @pytest.mark.parametrize('zero_stage', [0, 3]) @@ -30,9 +32,6 @@ def test(self, zero_stage): ds_config = { 'train_batch_size': self.world_size, - 'fp16': { - 'enabled': True - }, 'zero_optimization': { "stage": zero_stage, "offload_param": { @@ -40,6 +39,10 @@ def test(self, zero_stage): } } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} # 20B test #hidden_dim = 16 * 1024 hidden_dim = 4 @@ -49,11 +52,7 @@ def test(self, zero_stage): see_memory_usage('pre-init', force=True) model, _, _, _ = deepspeed.initialize(model=model, config=ds_config) see_memory_usage('post-init', force=True) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.half) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for batch in data_loader: model(batch[0], batch[1]) see_memory_usage('post-fwds', force=True) @@ -120,6 +119,9 @@ class TestOptimizerImplementation(DistributedTest): reuse_dist_env = True def test(self, optimizer_extension, model_dtype, grad_accum_dtype): + if not get_accelerator().is_fp16_supported(): + if model_dtype == 'fp16' or grad_accum_dtype == 'fp16': + pytest.skip("fp16 is not supported") if optimizer_extension == 'zero1': zero_stage = 1 elif optimizer_extension == 'zero2': diff --git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py index d9aba419b158..cda0d4f054d3 100644 --- a/tests/unit/runtime/test_multi_output_model.py +++ b/tests/unit/runtime/test_multi_output_model.py @@ -5,8 +5,9 @@ import torch import deepspeed +from deepspeed.accelerator import get_accelerator from pytest import approx -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.multi_output_model import MultiOutputModel, multi_output_dataloader @@ -28,10 +29,11 @@ def test(self, tmpdir): "lr": 0.00015 } }, - "fp16": { - "enabled": True - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif 
get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 weight_value = 0.1 @@ -53,7 +55,7 @@ def test(self, tmpdir): inputs, targets = batch[:midpoint], batch[midpoint:] loss_tuple = model(inputs, targets) - expected_loss = torch.tensor(2.302734375, dtype=torch.half, device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=preferred_dtype(), device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) assert loss.item() == approx(expected_loss.item()) @@ -84,10 +86,11 @@ def test(self, tmpdir): "lr": 0.00015 } }, - "fp16": { - "enabled": True - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 weight_value = 0.1 @@ -111,7 +114,7 @@ def test(self, tmpdir): loss_tuple = model(inputs, targets) assert len(loss_tuple) == 3 - expected_loss = torch.tensor(2.302734375, dtype=torch.half, device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=preferred_dtype(), device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) diff --git a/tests/unit/runtime/test_mup_optimizers.py b/tests/unit/runtime/test_mup_optimizers.py index ebecf73d416f..7666fa9d1c1f 100644 --- a/tests/unit/runtime/test_mup_optimizers.py +++ b/tests/unit/runtime/test_mup_optimizers.py @@ -10,6 +10,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from mup.shape import set_base_shapes +from deepspeed.accelerator import get_accelerator @pytest.mark.parametrize("optimizer, expected_opt_class", [("MuAdam", torch.optim.Adam), @@ -31,14 +32,15 @@ def test(self, optimizer, expected_opt_class, zero_offload): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 2, "cpu_offload": zero_offload } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) set_base_shapes(model, None) diff --git a/tests/unit/runtime/test_pld.py b/tests/unit/runtime/test_pld.py index 1f602db73b2f..f6da992d5e11 100644 --- a/tests/unit/runtime/test_pld.py +++ b/tests/unit/runtime/test_pld.py @@ -10,6 +10,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, PLD_SimpleModel, random_dataloader +from deepspeed.accelerator import get_accelerator @pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) @@ -39,15 +40,16 @@ def test_pld_model(self, theta): "lr": 0.0001 } }, - "fp16": { - "enabled": True - }, "progressive_layer_drop": { "enabled": True, "theta": theta, "gamma": gamma } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = PLD_SimpleModel(hidden_dim, empty_grad=False) @@ -80,15 +82,16 @@ def test_non_pld_model(self): "lr": 0.0001 } }, - "fp16": { - "enabled": True - }, "progressive_layer_drop": { "enabled": True, "theta": theta, "gamma": gamma } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) diff --git a/tests/unit/runtime/zero/test_ignore_unused_parameters.py 
b/tests/unit/runtime/zero/test_ignore_unused_parameters.py index aade488fde42..b1d341486e55 100644 --- a/tests/unit/runtime/zero/test_ignore_unused_parameters.py +++ b/tests/unit/runtime/zero/test_ignore_unused_parameters.py @@ -9,6 +9,7 @@ from deepspeed.ops.op_builder import CPUAdamBuilder import deepspeed +from deepspeed.accelerator import get_accelerator @pytest.mark.parametrize('ignore_unused_parameters', [False, True]) @@ -36,11 +37,11 @@ def test(self, ignore_unused_parameters): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + else: + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = UnusedParametersModel(hidden_dim=hidden_dim) diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index 5a8af95bb0f8..7262a1b2c998 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ b/tests/unit/runtime/zero/test_zero.py @@ -16,7 +16,7 @@ from torch.nn.parameter import Parameter from torch.nn.utils import skip_init -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.simple_model import SimpleModel, random_dataloader import deepspeed @@ -71,11 +71,11 @@ def test(self, zero_stage): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) @@ -91,6 +91,8 @@ class TestZero3RepeatForwardLoop(DistributedTest): world_size = 1 def test(self, mics_enabled, zero_stage=3): + if mics_enabled and get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") # force all params to be partitioned by forcing threshold=0 mics_shard_size = -1 if mics_enabled: @@ -111,11 +113,11 @@ def test(self, mics_enabled, zero_stage=3): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 class AlbertLikeModel(torch.nn.Module): @@ -166,11 +168,11 @@ def test_1_param_group(self, tmpdir, zero_stage, freeze_params): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} class MyModel(torch.nn.Module): @@ -260,11 +262,11 @@ def test_2_param_groups(self, tmpdir, zero_stage, freeze_params): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} class MyModel(torch.nn.Module): @@ -366,11 +368,11 @@ def test(self, allgather_bucket_size, zero_stage=2): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + 
config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) @@ -401,11 +403,11 @@ def test(self, zero_stage=2): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) @@ -625,6 +627,8 @@ def test_param_persistence_threshold(self, param_persistence_threshold): @pytest.mark.parametrize("fp16_enabled", [True, False]) def test_fp16_enabled(self, fp16_enabled): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") self._test(fp16_enabled=fp16_enabled) @pytest.mark.parametrize("contiguous_gradients", [True, False]) @@ -690,11 +694,11 @@ def _test( "lr": 1.0 } }, - "fp16": { - "enabled": fp16_enabled, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + cfg["fp16"] = {"enabled": True, "loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + cfg["bf16"] = {"enabled": True} if offload_optimizer: cfg["zero_optimization"]["offload_optimizer"] = { @@ -859,11 +863,11 @@ def forward(self, x: Tensor) -> Tensor: "lr": 1.0 } }, - "fp16": { - "enabled": True, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} with deepspeed.zero.Init(mem_efficient_linear=False, enabled=init_context_manager): model = LargeParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_config) @@ -938,24 +942,24 @@ def forward(self, x: Tensor) -> Tensor: "lr": 1.0 } }, - "fp16": { - "enabled": True, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + ds_cfg["fp16"] = {"enabled": True, "loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + ds_cfg["bf16"] = {"enabled": True} with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=init_context_manager): model = ManyParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_cfg) + dtype = preferred_dtype() for _ in range(3): # test multiple iterations to cover prefetching - activations: List[Tensor] = ds_engine( - torch.ones((param_sz, ), dtype=torch.float16, device=ds_engine.device)) + activations: List[Tensor] = ds_engine(torch.ones((param_sz, ), dtype=dtype, device=ds_engine.device)) assert len(activations) == n_layers partition_sz = math.ceil(param_sz / self.world_size) - expected_activations = torch.empty(param_sz, dtype=torch.float16, device=ds_engine.device) + expected_activations = torch.empty(param_sz, dtype=dtype, device=ds_engine.device) for start_idx in range(0, param_sz, partition_sz): expected_activations[start_idx:start_idx + partition_sz] = dist.get_rank() @@ -1007,11 +1011,11 @@ def __init_weights(self, module): "lr": 1.0 } }, - "fp16": { - "enabled": True, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + ds_cfg["fp16"] = {"enabled": True, "loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + ds_cfg["bf16"] = {"enabled": True} with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=True): model = ModelWhereParentInitializesChildWeights() @@ -1207,13 +1211,14 @@ def test(self): "lr": 1e-4 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 3 }, } + if 
get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 class SubModel(torch.nn.Module): @@ -1284,9 +1289,6 @@ def test(self): "lr": 1e-4 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 1, "offload_optimizer": { @@ -1294,6 +1296,10 @@ def test(self): } }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -1311,6 +1317,8 @@ class TestZero3DictFwd(DistributedTest): world_size = 1 def test(self, return_type): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -1320,13 +1328,14 @@ def test(self, return_type): "lr": 1e-4 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 3 }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 class MyModel(torch.nn.Module): @@ -1391,11 +1400,11 @@ def test(self, zero_stage): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim, nlayers=12) @@ -1445,13 +1454,14 @@ def test(self, zero_stage): "lr": 1e-4 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 class MyModel(torch.nn.Module): @@ -1497,9 +1507,6 @@ def test(self, force_ds_optim): "train_batch_size": 4, "gradient_accumulation_steps": 2, "steps_per_print": 1, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 1, "offload_optimizer": { @@ -1508,6 +1515,10 @@ def test(self, force_ds_optim): }, "zero_force_ds_cpu_optimizer": force_ds_optim, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -1529,15 +1540,15 @@ def test_training_partition_cache(self, training): hidden_dim = 10 config_dict = { "train_batch_size": 2, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 3, "stage3_param_persistence_threshold": hidden_dim, }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} if training: config_dict["optimizer"] = {"type": "Adam"} @@ -1546,13 +1557,11 @@ def test_training_partition_cache(self, training): model, _, _, _ = deepspeed.initialize(model=model, config=config_dict) - dtype = torch.half data_loader = random_dataloader( model=model, total_samples=6, hidden_dim=hidden_dim, device=model.device, - dtype=dtype, ) for _, batch in enumerate(data_loader): @@ -1576,6 +1585,8 @@ class TestEmptyParameterGroup(DistributedTest): 
world_size = 1 def test_empty_param_groups(self, dtype, use_client_optimizer, empty_weight_group): + if dtype == torch.float16 and not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") model = SimpleModel(hidden_dim=4, nlayers=4) param_groups = [ { diff --git a/tests/unit/runtime/zero/test_zero_context.py b/tests/unit/runtime/zero/test_zero_context.py index 0ddf1026eaf8..ec9e9e94aeaf 100644 --- a/tests/unit/runtime/zero/test_zero_context.py +++ b/tests/unit/runtime/zero/test_zero_context.py @@ -6,11 +6,13 @@ from types import SimpleNamespace import torch +import pytest import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.simple_model import SimpleModel from utils import setup_serial_env @@ -47,16 +49,17 @@ def forward(self, x): "lr": 0.00015 } }, - "fp16": { - "enabled": True, - "loss_scale": 138. - }, "zero_optimization": { "stage": 3, "stage3_param_persistence_threshold": 1, } } +if get_accelerator().is_fp16_supported(): + config["fp16"] = {"enabled": True, "loss_scale": 138.} +elif get_accelerator().is_bf16_supported(): + config["bf16"] = {"enabled": True} + class TestZeroGatheredParametersFree(DistributedTest): world_size = 1 @@ -124,6 +127,8 @@ def test_scattered_init_dist(self): assert dist.is_initialized() def test_scatter_halftype(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") setup_serial_env() with deepspeed.zero.Init(): @@ -248,7 +253,7 @@ def forward(self, input): with deepspeed.zero.GatheredParameters(net.linear1.weight): assert net.linear1.weight.numel() == net.dim**2 - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) engine.backward(loss) engine.step() diff --git a/tests/unit/runtime/zero/test_zero_context_return.py b/tests/unit/runtime/zero/test_zero_context_return.py index 874a8ea3b676..9d49b6d3ba88 100644 --- a/tests/unit/runtime/zero/test_zero_context_return.py +++ b/tests/unit/runtime/zero/test_zero_context_return.py @@ -8,9 +8,10 @@ import pytest import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.accelerator import get_accelerator from utils import setup_serial_env -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype class DanglingBias(torch.nn.Linear): @@ -119,16 +120,17 @@ def forward(self, input): "lr": 0.00015 } }, - "fp16": { - "enabled": True, - "loss_scale": 138. 
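# --- Editor's note (illustrative sketch, not part of this patch) ---------------------
# Nearly every config hunk in this group makes the same edit: the hard-coded
# "fp16": {"enabled": True, ...} block is removed from the dict literal and re-added
# only when the accelerator supports fp16, with a bf16 fallback otherwise. The inlined
# branches are equivalent to a small helper like the one below; add_mixed_precision is
# a hypothetical name used here only to show the selection rule in one place.
from deepspeed.accelerator import get_accelerator

def add_mixed_precision(config_dict, initial_scale_power=None, loss_scale=None):
    # Mirror of the per-test branches: prefer fp16 when supported, else fall back to bf16.
    if get_accelerator().is_fp16_supported():
        fp16 = {"enabled": True}
        if initial_scale_power is not None:
            fp16["initial_scale_power"] = initial_scale_power
        if loss_scale is not None:
            fp16["loss_scale"] = loss_scale
        config_dict["fp16"] = fp16
    elif get_accelerator().is_bf16_supported():
        config_dict["bf16"] = {"enabled": True}
    return config_dict

# Example of the inlined pattern: add_mixed_precision({"train_batch_size": 2}, initial_scale_power=8)
# -------------------------------------------------------------------------------------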
- }, "zero_optimization": { "stage": 3, "stage3_param_persistence_threshold": 1, } } +if get_accelerator().is_fp16_supported(): + config["fp16"] = {"enabled": True, "loss_scale": 138.} +elif get_accelerator().is_bf16_supported(): + config["bf16"] = {"enabled": True} + class TestReturnParam(DistributedTest): world_size = 1 @@ -142,7 +144,7 @@ def test_ext_param_return(self): engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) engine.backward(loss) engine.step() @@ -158,7 +160,7 @@ def test_ext_param_returnobj(self): engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) assert len(net._external_params) == 1 assert len(net.dangler._external_params) == 0 @@ -176,7 +178,7 @@ def test_stage_3_output_type(self, output_type): engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(1): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) if loss is not None: if isinstance(loss, dict): diff --git a/tests/unit/runtime/zero/test_zero_leaf_module.py b/tests/unit/runtime/zero/test_zero_leaf_module.py index 0855acec57e3..1d3b88a04a4e 100644 --- a/tests/unit/runtime/zero/test_zero_leaf_module.py +++ b/tests/unit/runtime/zero/test_zero_leaf_module.py @@ -6,11 +6,12 @@ import deepspeed.comm as dist import torch -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.simple_model import random_dataloader import deepspeed from deepspeed.utils import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module +from deepspeed.accelerator import get_accelerator class ChooseModuleByCounter(torch.nn.Module): @@ -89,9 +90,6 @@ def _test_set_z3_leaf_modules(self, cls, requires_grad): "lr": 1e-6 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 3, "stage3_prefetch_bucket_size": hidden_dim**2, @@ -99,6 +97,10 @@ def _test_set_z3_leaf_modules(self, cls, requires_grad): "stage3_max_reuse_distance": 0, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} model = cls(hidden_dim) @@ -106,7 +108,7 @@ def _test_set_z3_leaf_modules(self, cls, requires_grad): set_z3_leaf_modules(model, [cls]) assert z3_leaf_module(model) - run_model(model, config_dict, hidden_dim, torch.float16, requires_grad) + run_model(model, config_dict, hidden_dim, preferred_dtype(), requires_grad) def test_choose_module_by_counter(self): self._test_set_z3_leaf_modules(ChooseModuleByCounter, True) diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py index b3adfdf96c50..3bb4af3e3d91 100644 --- a/tests/unit/runtime/zero/test_zero_tensor_fragment.py +++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py @@ -7,7 +7,7 @@ import deepspeed.comm as dist import torch -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from 
unit.simple_model import random_dataloader, SimpleModel from unit.util import bf16_required_version_check @@ -18,6 +18,7 @@ from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_optimizer_state from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.accelerator import get_accelerator WEIGHT_KEY = 'weight' FIRST_ORDER_KEY = 'exp_avg' @@ -112,14 +113,14 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, froz "lr": 1e-6 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 2 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 2} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} if offload_device == OffloadDeviceEnum.cpu: config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} @@ -139,9 +140,12 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, froz validate_after_bwd = lambda model: validate_tensor(model, api_type, opt_states=False) validate_after_step = lambda model: validate_tensor(model, api_type, opt_states=True) - run_fragmented_model(model, config_dict, hidden_dim, torch.float16, validate_after_bwd, validate_after_step) + run_fragmented_model(model, config_dict, hidden_dim, preferred_dtype(), validate_after_bwd, + validate_after_step) def test_bf16_fragments(self, frozen_weights): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") if frozen_weights: pytest.skip("TODO: Frozen weights not currently supported by BF16 Optimizer") @@ -302,6 +306,8 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, dtyp } if dtype == torch.float16: + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} elif dtype == torch.bfloat16: config_dict["bf16"] = {"enabled": True} diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py index 01ce3d2fe4c9..3357c200bd68 100644 --- a/tests/unit/simple_model.py +++ b/tests/unit/simple_model.py @@ -14,6 +14,7 @@ from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist +from .common import preferred_dtype class SimpleModel(torch.nn.Module): @@ -262,21 +263,21 @@ def forward(self, x, y, **kwargs): return hidden_dim -def random_dataset(total_samples, hidden_dim, device, dtype=torch.half): +def random_dataset(total_samples, hidden_dim, device, dtype=preferred_dtype()): train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype) train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) train_dataset = torch.utils.data.TensorDataset(train_data, train_label) return train_dataset -def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half): +def random_dataloader(model, total_samples, hidden_dim, device, dtype=preferred_dtype()): batch_size = model.train_micro_batch_size_per_gpu() train_dataset = random_dataset(total_samples, hidden_dim, device, dtype=dtype) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) return train_loader -def sequence_dataloader(model, total_samples, hidden_dim, device, seq_len: int = 32, dtype=torch.half): +def sequence_dataloader(model, total_samples, hidden_dim, device, seq_len: int = 32, 
dtype=preferred_dtype()): batch_size = model.train_micro_batch_size_per_gpu() train_data = torch.randn(total_samples, seq_len, hidden_dim, device=device, dtype=dtype) train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) diff --git a/tests/unit/util.py b/tests/unit/util.py index 75c3000bd4a2..feec326ede6c 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -7,7 +7,6 @@ import torch from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported from deepspeed.git_version_info import torch_info -from packaging import version as pkg_version def skip_on_arch(min_arch=7): @@ -47,29 +46,21 @@ def bf16_required_version_check(accelerator_check=True): cuda_version_available = CUDA_MAJOR >= 11 nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' + hpu_available = get_accelerator().device_name() == 'hpu' + xpu_available = get_accelerator().device_name() == 'xpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True elif npu_available: return True + elif hpu_available: + return True + elif xpu_available: + return True else: return False -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" - - torch_version = pkg_version.parse(torch.__version__) - - if min_version and pkg_version.parse(str(min_version)) > torch_version: - return False - - if max_version and pkg_version.parse(str(max_version)) < torch_version: - return False - - return True - - def required_amp_check(): from importlib.util import find_spec if find_spec('apex') is None: diff --git a/tests/unit/utils/test_groups.py b/tests/unit/utils/test_groups.py index d8f12be4f3c6..5cd35baf3510 100644 --- a/tests/unit/utils/test_groups.py +++ b/tests/unit/utils/test_groups.py @@ -18,7 +18,7 @@ def test_get_expert_parallel_ranks(): expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] """ expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks(world_size=16, - model_parallel_size_=2, + tensor_parallel_size_=2, expert_parallel_size_=4) assert expert_parallel_groups == [ [0, 2, 4, 6],
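# --- Editor's note (illustrative sketch, not part of this patch) ---------------------
# Several files above now default their dataloaders and expected-loss tensors to
# preferred_dtype(), imported from tests/unit/common.py. Its implementation is not part
# of this diff; based on how it is used together with is_fp16_supported() and
# is_bf16_supported(), it presumably resolves to something close to the sketch below
# (an assumption, not the verified source).
import torch
from deepspeed.accelerator import get_accelerator

def preferred_dtype():
    # Lowest-precision dtype the current accelerator can train with in these unit tests.
    if get_accelerator().is_fp16_supported():
        return torch.float16
    elif get_accelerator().is_bf16_supported():
        return torch.bfloat16
    return torch.float32

# Usage as in simple_model.py above: random_dataloader(..., dtype=preferred_dtype())
# -------------------------------------------------------------------------------------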