diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index ea8d2f5f806f8..6e8d5847835d0 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -48,8 +48,6 @@ jobs: - name: Install (ROCm) apex run: | git clone https://github.com/ROCmSoftwarePlatform/apex.git - cd apex - git checkout torch_2.1_higher CURRENT_VER=$(git rev-parse HEAD) INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version) if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index fc0dac5de9a26..007313964f4aa 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -27,7 +27,7 @@ jobs: env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 0de6832b37c13..0125fa50bc140 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/hpu-gaudi2-nightly.yml b/.github/workflows/hpu-gaudi2-nightly.yml new file mode 100644 index 0000000000000..5c5caff1ebb04 --- /dev/null +++ b/.github/workflows/hpu-gaudi2-nightly.yml @@ -0,0 +1,85 @@ +name: hpu-gaudi2-nightly + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/hpu-gaudi2-nightly.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + unit-tests: + # The type of runner that the job will run on + runs-on: [self-hosted, intel, gaudi2] + container: + image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + ports: + - 80 + options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice + + env: + PT_HPU_LAZY_MODE: 0 + TORCHINDUCTOR_COMPILE_THREADS: 1 + TEST_LIST: | + test_adamw.py + test_bf16.py + test_ds_config_dict.py + test_dynamic_loss_scale.py + test_latest_checkpoint.py + test_moe_checkpoint.py + test_multi_output_model.py + test_other_optimizer.py + test_pipe.py + test_pipeline.py + test_universal_checkpoint.py + test_zero_context_return.py + test_zero_leaf_module.py + test_zero_offloadpp.py + test_zero_tiled.py + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + + - name: Check container state + run: | + ldd --version + hl-smi -L + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + - name: Install transformers + run: | + git clone https://github.com/huggingface/transformers + cd transformers + git rev-parse --short HEAD + pip install . + + - name: Install deepspeed + run: | + pip install .[dev,autotuning] + ds_report + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + cd tests + export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE} + export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS} + TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}') + echo "TEST_LIST ${TEST_LIST}" + pytest --verbose unit/ -k "${TEST_LIST}" diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 4e9ceb32b6b16..9f1a9d973ca27 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -39,7 +39,7 @@ jobs: # The type of runner that the job will run on runs-on: [self-hosted, intel, gaudi2] container: - image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ports: - 80 options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml index eb3ac9b03161a..1a13c0f3f4f10 100644 --- a/.github/workflows/no-torch.yml +++ b/.github/workflows/no-torch.yml @@ -19,7 +19,7 @@ permissions: jobs: unit-tests: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 @@ -30,6 +30,7 @@ jobs: - name: Python environment run: | pip uninstall torch --yes + pip install setuptools pip list - name: Build deepspeed diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 484948b28e34a..639f27498dd9d 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -23,7 +23,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:24.03-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -47,8 +47,6 @@ jobs: - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja - # Update packages included in the container that do not support pydantic 2+ to versions that do - python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning,inf] ds_report - name: Python environment @@ -58,8 +56,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12" - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.3" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.3" --cuda_ver="12" - name: MII unit tests run: | BRANCH="main" diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index 2ad336cac4edc..329a1060f5ebd 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -12,6 +12,7 @@ on: type: string pull_request: paths: + - ".github/workflows/nv-ds-chat.yml" - "deepspeed/runtime/zero/stage_1_and_2.py" - "deepspeed/runtime/zero/stage3.py" - "deepspeed/runtime/hybrid_engine.py" @@ -42,6 +43,7 @@ jobs: - name: Install deepspeed run: | + pip install transformers==4.45.2 pip install .[dev] ds_report diff --git a/.github/workflows/nv-human-eval.yml b/.github/workflows/nv-human-eval.yml index 3de878547d6e7..2ecdf218b96af 100644 --- a/.github/workflows/nv-human-eval.yml +++ b/.github/workflows/nv-human-eval.yml @@ -11,7 +11,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:24.03-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -50,4 +50,4 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.3" --cuda_ver="12" diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 044c282ba1198..f92aa7edfdd51 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -22,7 +22,7 @@ jobs: runs-on: [self-hosted, nvidia, cu121, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index 72ba8abbd95df..fc810bc190d0d 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -21,7 +21,7 @@ concurrency: jobs: unit-tests: - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 container: image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116 diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml index 0344c80451a68..af406075b868e 100644 --- a/.github/workflows/nv-sd.yml +++ b/.github/workflows/nv-sd.yml @@ -27,7 +27,7 @@ jobs: sd-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:24.03-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -53,8 +53,6 @@ jobs: pip install image-similarity-measures python -m pip install opencv-python==4.6.* --force-reinstall python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja - # Update packages included in the container that do not support pydantic 2+ to versions that do - python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning,sd] ds_report - name: Python environment @@ -64,7 +62,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.3" --cuda_ver="12" - name: Open GitHub issue if weekly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-torch110-p40.yml b/.github/workflows/nv-torch110-p40.yml index ed639aeb3b62c..31d7805db7bb2 100644 --- a/.github/workflows/nv-torch110-p40.yml +++ b/.github/workflows/nv-torch110-p40.yml @@ -20,7 +20,7 @@ jobs: env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/nv-torch110-v100.yml b/.github/workflows/nv-torch110-v100.yml index 4b9f278448ab7..bb1bc987379c3 100644 --- a/.github/workflows/nv-torch110-v100.yml +++ b/.github/workflows/nv-torch110-v100.yml @@ -20,7 +20,7 @@ jobs: env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 3103e3f36e84f..35f9502ecbc94 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -21,10 +21,10 @@ jobs: unit-tests: strategy: matrix: - pyVersion: ["3.7", "3.8", "3.9", "3.10"] + pyVersion: ["3.8", "3.9", "3.10"] fail-fast: false - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 container: image: deepspeed/gh-builder:py${{ matrix.pyVersion }} diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index adeeb0acade2f..d19e73aeef1ce 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -36,7 +36,7 @@ jobs: unit-tests: runs-on: [self-hosted, intel, xpu] container: - image: intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04 + image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 ports: - 80 options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL @@ -47,12 +47,11 @@ jobs: run: | apt-get update apt-get install clinfo libaio-dev python3-pip -y - pip install torch==2.1.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu - pip install intel-extension-for-pytorch==2.1.30+xpu -f https://developer.intel.com/ipex-whl-stable-xpu - pip install intel-extension-for-pytorch-deepspeed==2.1.30 -f https://developer.intel.com/ipex-whl-stable-xpu - pip install oneccl_bind_pt==2.1.300+xpu -f https://developer.intel.com/ipex-whl-stable-xpu - pip install torchvision==0.16.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu - pip install py-cpuinfo numpy==1.26 + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install py-cpuinfo numpy pip install .[dev,autotuning] - name: Check container state diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b89c872eefe5e..b5d8afa8e0b41 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/google/yapf - rev: v0.32.0 + rev: v0.40.0 hooks: - id: yapf @@ -65,7 +65,7 @@ repos: ] - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 5.0.4 hooks: - id: flake8 args: ['--config=.flake8'] diff --git a/COMMITTERS.md b/COMMITTERS.md new file mode 100644 index 0000000000000..bcb8579bf1f79 --- /dev/null +++ b/COMMITTERS.md @@ -0,0 +1,9 @@ +# DeepSpeed TSC Committers # + +| Name | GitHub ID | Affiliation +|--- | ---- | --- | +| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft | +| Logan Adams | [loadams](https://github.com/loadams) | Microsoft | +| Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft | +| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake | +| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC | diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 0000000000000..d488ec55114e7 --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,101 @@ + +# DeepSpeed Project Charter and Governance + +This charter sets forth the responsibilities and procedures for technical contribution to, and oversight of, the DeepSpeed open source project. All contributors (including committers, maintainers, and other technical positions) and other participants in the Project (collectively, "Collaborators") must comply with the terms of this Charter. + +## Mission and Scope of the Project + +The mission of the Project is to DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective. + +The scope of the Project includes collaborative development under the Project License (as defined herein) supporting the mission, including documentation, testing, integration, and the creation of other artifacts that aid the development, deployment, operation, or adoption of the open source project. + +## Technical Steering Committee + +1. The Technical Steering Committee (the "TSC") will be responsible for all technical oversight of the open source Project. + +2. The TSC voting members are initially the Project's Committers. At the inception of the project, the Committers of the Project will be as set forth within the "CONTRIBUTING" file within the Project's code repository. The TSC may choose an alternative approach for determining the voting members of the TSC, and any such alternative approach will be documented in the CONTRIBUTING file. Any meetings of the Technical Steering Committee are intended to be open to the public, and can be conducted electronically, via teleconference, or in person. + +3. TSC projects generally will involve Contributors and Committers. The TSC may adopt or modify roles so long as the roles are documented in the CONTRIBUTING file. Unless otherwise documented: + + - **Contributors** include anyone in the technical community that contributes code, documentation, or other technical artifacts to the Project. + - **Committers** are Contributors who have earned the ability to modify ("commit") source code, documentation, or other technical artifacts in a project's repository. + + - A Contributor may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers. + +4. Participation in the Project through becoming a Contributor and Committer is open to anyone so long as they abide by the terms of this Charter. + +5. The TSC may: + - Establish workflow procedures for the submission, approval, and closure/archiving of projects. + - Set requirements for the promotion of Contributors to Committer status, as applicable. + - Amend, adjust, refine and/or eliminate the roles of Contributors and Committers, and create new roles, and publicly document any TSC roles, as it sees fit. + +6. The TSC may elect a TSC Chair, who will preside over meetings of the TSC and will serve until their resignation or replacement by the TSC. The TSC Chair, or any other TSC member so designated by the TSC, will serve as the primary communication contact between the Project and AI & Data, a directed fund of The Linux Foundation. + +7. Responsibilities: The TSC will be responsible for all aspects of oversight relating to the Project, which may include: + + - Coordinating the technical direction of the Project. + - Approving project or system proposals (including, but not limited to, incubation, deprecation, and changes to a sub-project's scope). + - Organizing sub-projects and removing sub-projects. + - Creating sub-committees or working groups to focus on cross-project technical issues and requirements. + - Appointing representatives to work with other open source or open standards communities. + - Establishing community norms, workflows, issuing releases, and security issue reporting policies. + - Approving and implementing policies and processes for contributing (to be published in the CONTRIBUTING file) and coordinating with the series manager of the Project (as provided for in the Series Agreement, the "Series Manager") to resolve matters or concerns that may arise as set forth in Section 7 of this Charter. + - Discussions, seeking consensus, and where necessary, voting on technical matters relating to the code base that affect multiple projects. + - Coordinating any marketing, events, or communications regarding the Project. + +## TSC Voting + +1. While the Project aims to operate as a consensus-based community, if any TSC decision requires a vote to move the Project forward, the voting members of the TSC will vote on a one vote per voting member basis. + +2. Quorum for TSC meetings requires at least fifty percent of all voting members of the TSC to be present. The TSC may continue to meet if quorum is not met but will be prevented from making any decisions at the meeting. + +3. Except as provided in Section 7.c. and 8.a, decisions by vote at a meeting require a majority vote of those in attendance, provided quorum is met. Decisions made by electronic vote without a meeting require a majority vote of all voting members of the TSC. + +4. In the event a vote cannot be resolved by the TSC, any voting member of the TSC may refer the matter to the Series Manager for assistance in reaching a resolution. + +## Compliance with Policies + +1. This Charter is subject to the Series Agreement for the Project and the Operating Agreement of LF Projects. Contributors will comply with the policies of LF Projects as may be adopted and amended by LF Projects, including, without limitation, the policies listed at https://lfprojects.org/policies/. + +2. The TSC may adopt a code of conduct ("CoC") for the Project, which is subject to approval by the Series Manager. In the event that a Project-specific CoC has not been approved, the LF Projects Code of Conduct listed at https://lfprojects.org/policies will apply for all Collaborators in the Project. + +3. When amending or adopting any policy applicable to the Project, LF Projects will publish such policy, as to be amended or adopted, on its website at least 30 days prior to such policy taking effect; provided, however, that in the case of any amendment of the Trademark Policy or Terms of Use of LF Projects, any such amendment is effective upon publication on LF Project's website. + +4. All Collaborators must allow open participation from any individual or organization meeting the requirements for contributing under this Charter and any policies adopted for all Collaborators by the TSC, regardless of competitive interests. Put another way, the Project community must not seek to exclude any participant based on any criteria, requirement, or reason other than those that are reasonable and applied on a non-discriminatory basis to all Collaborators in the Project community. + +5. The Project will operate in a transparent, open, collaborative, and ethical manner at all times. The output of all Project discussions, proposals, timelines, decisions, and status should be made open and easily visible to all. Any potential violations of this requirement should be reported immediately to the Series Manager. + +## Community Assets + +1. LF Projects will hold title to all trade or service marks used by the Project ("Project Trademarks"), whether based on common law or registered rights. Project Trademarks will be transferred and assigned to LF Projects to hold on behalf of the Project. Any use of any Project Trademarks by Collaborators in the Project will be in accordance with the license from LF Projects and inure to the benefit of LF Projects. + +2. The Project will, as permitted and in accordance with such license from LF Projects, develop and own all Project GitHub and social media accounts, and domain name registrations created by the Project community. + +3. Under no circumstances will LF Projects be expected or required to undertake any action on behalf of the Project that is inconsistent with the tax-exempt status or purpose, as applicable, of the Joint Development Foundation or LF Projects, LLC. + +## General Rules and Operations + +The Project will: + +1. Engage in the work of the Project in a professional manner consistent with maintaining a cohesive community, while also maintaining the goodwill and esteem of LF Projects, Joint Development Foundation, and other partner organizations in the open source community. +2. Respect the rights of all trademark owners, including any branding and trademark usage guidelines. + +## Intellectual Property Policy + +1. Collaborators acknowledge that the copyright in all new contributions will be retained by the copyright holder as independent works of authorship and that no contributor or copyright holder will be required to assign copyrights to the Project. + +2. Except as described in Section 7.c., all contributions to the Project are subject to the following: + + - All new inbound code contributions to the Project must be made using Apache License, Version 2.0 available at http://www.apache.org/licenses/LICENSE-2.0 (the "Project License"). + - All new inbound code contributions must also be accompanied by a Developer Certificate of Origin (http://developercertificate.org) sign-off in the source code system that is submitted through a TSC-approved contribution process which will bind the authorized contributor and, if not self-employed, their employer to the applicable license. + - All outbound code will be made available under the Project License. + - Documentation will be received and made available by the Project under the Creative Commons Attribution 4.0 International License (available at http://creativecommons.org/licenses/by/4.0/). + - The Project may seek to integrate and contribute back to other open source projects ("Upstream Projects"). In such cases, the Project will conform to all license requirements of the Upstream Projects, including dependencies, leveraged by the Project. Upstream Project code contributions not stored within the Project's main code repository will comply with the contribution process and license terms for the applicable Upstream Project. + +3. The TSC may approve the use of an alternative license or licenses for inbound or outbound contributions on an exception basis. To request an exception, please describe the contribution, the alternative open source license(s), and the justification for using an alternative open source license for the Project. License exceptions must be approved by a two-thirds vote of the entire TSC. + +4. Contributed files should contain license information, such as SPDX short form identifiers, indicating the open source license or licenses pertaining to the file. + +## Amendments + +1. This charter may be amended by a two-thirds vote of the entire TSC and is subject to approval by LF Projects. diff --git a/README.md b/README.md index b302e32dfd9ca..8bfc344a4edc7 100755 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## Latest News DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). - +* [2024/12] [DeepSpeed-Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md) * [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/windows/08-2024/japanese/README.md)] * [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)] * [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)] @@ -121,7 +121,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | | Documentation | | ---------------------------------------------------------------------------------------------- | -------------------------------------------- | - | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) | + | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/deepspeed) | | | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) | | | [Lightning with DeepSpeed](https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed) | | | [MosaicML with DeepSpeed](https://docs.mosaicml.com/projects/composer/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | @@ -142,7 +142,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | -| Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml/badge.svg?branch=master)](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml) | +| Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/Ascend/Ascend-CI/actions/workflows/deepspeed.yaml/badge.svg?branch=main)](https://github.com/Ascend/Ascend-CI/actions/workflows/deepspeed.yaml) | # Installation diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 1e4335b192928..0e49bd9f64581 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -71,6 +71,8 @@ def device_count(self): # In flat mode, HBM is in separate NUMA node with no cores on this node. # Ignore these NUMA nodes with no cores. numa_core_lists = get_numa_cores() + if not numa_core_lists: + return 1 numa_count = 0 prev_core_list = [] for core_list in numa_core_lists: diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 6da48000dafa4..ad8a10710bf2c 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -26,8 +26,8 @@ def is_synchronized_device(self): return False def use_host_timers(self): - # WA XPU event will be consolidated in 2.5 - if ipex.__version__ < '2.5': + # WA XPU event will be consolidated in 2.6 + if ipex.__version__ < '2.6': return True else: return self.is_synchronized_device() diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md index 23ead99f5c8d2..3ec570a9ea407 100644 --- a/blogs/deepspeed-chat/japanese/README.md +++ b/blogs/deepspeed-chat/japanese/README.md @@ -332,7 +332,7 @@ DeepSpeedは、以下のような機能を提供します。 DeepSpeedは、Microsoftの[AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/)の一部で、次世代AIの機能の大規模な実現を進めています。詳細は[こちら](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)をご覧ください。DeepSpeedは、[Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf), [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed), [GLM (130B)](https://github.com/THUDM/GLM-130B), [YaLM (100B)](https://github.com/yandex/YaLM-100B) を含め、様々な大規模モデルを学習するのに使用されてきました。 -またDeepSpeedは、 [Hugging Face Transformers](https://huggingface.co/docs/transformers/main/main_classes/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) など、多くの著名なオープンソースの深層学習フレームワークのバックエンドとして利用されています。 +またDeepSpeedは、 [Hugging Face Transformers](https://huggingface.co/docs/transformers/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) など、多くの著名なオープンソースの深層学習フレームワークのバックエンドとして利用されています。 DeepSpeedについてのより詳しい情報は、以下をご覧ください。 diff --git a/blogs/deepspeed-domino/README.md b/blogs/deepspeed-domino/README.md new file mode 100644 index 0000000000000..7dfdc7dac1c09 --- /dev/null +++ b/blogs/deepspeed-domino/README.md @@ -0,0 +1,199 @@ +

+ domino logo +

+ +
+ +# Domino: Communication-Free LLM Training Engine + +
+ +
+ + +
+ +*Figure 1: Project Domino is Microsoft DeepSpeed's Tensor Parallel (TP) Training Engine, which provides a uniform solution for both single-node and **multi-node** cases. Domino scales up traditional single-node-only TP solution to multi-node environments via **near-complete communication hiding** behind computation.* + +
+

+ +# Table of Content +1. [Introduction](#introduction) +2. [Domino Highlights](#domino-highlights) +3. [Design Motivation](#design-motivation) +4. [Domino Design](#domino-design) +5. [Implementation and Optimization](#implementation-and-optimization) +6. [Getting Started: Try out DeepSpeed-Domino](#getting-started-try-out-deepspeed-domino) +7. [Citation](#citation) +8. [Acknowledgements](#acknowledgements) + + +# Introduction + +Generative AI (GenAI) has enabled transformative applications in a wide variety of domains, including chatbot, text summarization, and high-quality image and video generation. These capabilities are built on top of large foundation models, particularly Large Language Models (LLMs). LLMs are typically based on the [Transformer](https://arxiv.org/abs/1706.03762) network architecture, and include popular model families such as GPT and Llama. LLMs have grown beyond the memory capacity of a single accelerator (e.g., GPU), and so inferencing or training them requires distributed processing using multiple GPUs or even multiple nodes. + +Tensor parallelism (TP) is a popular distributed technique for training LLMs. TP leverages the aggregate memory of multiple GPUs to fit LLMs by partitioning each model layer across the GPUs. However, TP incurs two communication collective operations for each partitioned layer, separately for the forward and backward passes. TP is appealing due to its excellent system efficiency in single-node cases, where GPUs are directly connected via high bandwidth links like NVLink and NVSwitch. However, TP falls short in multi-node cases due to the lower bandwidth of cross-node interconnects. [Prior work](https://arxiv.org/abs/2406.06858) reports that communication can take up to 75\% of end-to-end training time. Figure 2 shows that even on the latest DGX-H100 nodes interconnected with high-end Infiniband of 400GB/s bandwidth, communication overheads remains as high as 43\% of end-to-end training iteration time. Recent advances in GeMM+NCCL kernel fusion are unable to fully hide communication overheads due to their limited scope of computation-communication overlapping. The trend of faster compute in newer GPUs (e.g., DGX-B200) indicates that the communication overheads of TP will be more pronounced in both single node and multiple node scenarios. + +
+
+ + *Figure 2: TP communication overhead in GPT-3-13B training using 1,2,4 DGX-H100 nodes (i.e., 8, 16, 32 H100 GPUs).* + +
+ +# Domino Highlights + + +* Domino is TP optimization technique that achieves **Near-Complete** communication hiding behind computation by decomposing a single batch training iteration into smaller and independent pieces, allowing efficient pipelining. + +Domino is the first work that provides a **uniform** Tensor Parallelism (TP) solution for both single-node and **multi-node** cases. Traditional TP solutions (e.g., Megatron-LM) fall short in multi-node cases due to limited cross-node communication bandwidth. + +### Performance + +We tested Domino on 1 to 4 DGX-H100 boxes (8xH100 per box). Each node has intra-node NVLink bandwidth of 900GB/s and inter-node IB bandwidth of 400GB/s. We oberved the following performance results: +1. For both GPT and Llama model series, Domino outperforms Megatron-LM by up to **1.3x** and **1.2x** respectively in end-to-end training iteration throughput for different model sizes, sequence lengths and batch sizes. These results are summarized in Figure 1. +2. For several cases, Domino achieves **near-optimal** training throughput, where optimal throughput refers to the throughput achieved assuming the communication collectives of TP are disabled. + +For more detailed performance results, please refer to our [arxiv paper](https://arxiv.org/abs/2409.15241). + +# Design Motivation + +In this section, we briefly discuss three topics. First, we motivate why the time is right is for a uniform TP solution for both single node and multi-node cases. Next, we analyze the communication overhead on latest Nvidia DGX-H100 boxes with high cross-node communication interconnects. Finally, we describe TP's sequential data dependency which causing communication stands out. + +### It is time for a uniform TP for single and multi-node scenarios + +Nvidia is pushing hard on breaking communication bandwidth gap between intra-node (i.e., GPUs within a node connected with NVLink) and inter-node (i.e., cross-node connected with Infini-Band(IB)). For example, each DGX-H100 is equipped with eight ConnectX-7 network cards and gets aggregated cross-node bandwidth of 400GB/s, which is at same level of intra-node NVLink (900GB/s). Therefore, it is time for proposing a uniform solution for both single node and multi-node TP training. + +### Communication Overhead in TP + +As described in [Megatron-LM paper](https://arxiv.org/pdf/1909.08053), for TP, every transformer block (i.e.,1 Self-Attention layer + 1 MLP layer) incurs 4 AllReduce calls, two in forward pass and two in the backward pass (shown in Figure 3). Given a LLM consisting of $N$ stacked transformer blocks, the number of AllReduce calls required for TP training is $4 * N$. Even for small models like GPT-3 2.7B or 6.7B which consists of 32 layers, the total number of AllReduce calls is 128 for every training iteration. For larger models, the number of AllReduce calls grows linearly with number of layers. + +
+
+ + *Figure 3: TP communication = 4 x AllReduce x num\_transformer\_block* + +
+ +One big issue for TP is that the *communication resides on critical path of every input batch training execution* due to sequential data dependency we described in the following [TP data dependency analysis](#tp-data-dependency-analysis) section. Therefore, the communication overhead stands out and is difficult to hide behind computation. In Figure 4, we provide our communication overhead measurement using Megatron-LM training GPT-3 and Llama-2 model series with different model sizes and batch sizes across 1 to 4 DGX-H100 nodes (i.e., 8 to 32 H100 GPUs). The communication overhead is up to **47\%** despite using latest Nvidia hardware DGX-H100 with 400GB/s cross-node bandwidth. + +
+
+ + *Figure 4: TP communication and computation ratio per training iteration time over different models and batch sizes using 1 to 4 DGX-H100 nodes.* + +
+ +As Llama-3 405B model training takes 54 days on 16,000 H100 GPUs, the projected communication time can be up to around **25 days on 16,000 H100s**. This finding shows that, despite using latest high-bandwidth interconnects like NVLink/Infini-Band(IB), the communication overheads of TP remains a huge portion of end-to-end training time. + +### TP data dependency analysis + +In traditional TP, shown in Figure 5, a transformer layer (either Attn or MLP layer) computation can be abstracted into $X\*A\*B=Y$, where $X$ is input. For attention layer, $A$ is attention computation (e.g., multihead-attention) and $B$ is linear layer. For MLP layer, both $A$ and $B$ are linear layers. An AllReduce is conducted on $Y$ after computation. Due to **sequential data dependency on $Y$ between computation (i.e., $X\*A\*B=Y$) and communication (i.e., AllReduce($Y$)), AllReduce($Y$) completely stands out**, thus making TP not efficient in limited communication bandwidth scenarios. + +
+
+
+ + *Figure 5: TP Forward pass of single Self-Attention/MLP layer. (X is input, A is attention computation for Self-Attention layer and linear for MLP layer, B is linear for both Self-Attention and MLP layer. Y is X\*A\*B output)* + +
+
+ + +# Domino Design + +Compared to Figure 5, Domino breaks data dependency of $X\*A\*B$ via [*Row-wise Split on Inputs X*](#row-wise-split-on-inputs-x), [*Column-wise Split on Weights B*](#column-wise-split-on-weights-b), as well as a [hybrid solution combining these two](#2d-split-on-both-x-and-b). After breaking computation into pieces, Domino pipelines computation and communication working on different independent pieces, thus achieving near-complete communication hiding behind computation. Domino's unique benefits are listed as follows: + +1. Comparing with GeMM+NCCL kernel fusion techniques, Domino breaks data dependency thus has a much wider range of computation kernel sequences to overlap with NCCL call. For example, Domino can overlap AllReduce not only to a single GeMM, but also extend overlapping scope to multiple GeMMs, LayerNorm, DropOut and more. +2. Domino achieves near-complete communication hiding behind computation, thus also achieves near-optimal system throughput in certain cases. (Optimal throughput refers to end-to-end throughput that disables all communication in TP training.) +3. Domino works at kernel scheduler level, any kernel optimizations or new kernels can be seamlessly integrated into Domino framework. +4. Domino tensor partition scheme is simple and generic. It is easy for user side end-to-end correctness debugging when facing issues like overflow or weights/gradients errors. + +For the ease of illustration, we describe forward propagation only (since backward pass is just in reverse order), and we describe only splitting tensor into two chunks. + +## Row-wise split on Inputs X: + +Domino breaks Input X in row dimension (i.e. batch dimension). + +
+
+ + *Figure 6: Domino row-wise (batch-dim) split on inputs X.* + +
+ +**Data Dependency**: Split inputs' batch dimension has no data dependency for both intra-layer and inter-layer cases. Therefore, we achieve both *intra-layer* (AllReduce($Y1$) and $X2\*A\*B$) and *inter-layer* (AllReduce($Y2$) and next-layer's $X1\*A\*B$) computation-communication overlapping. With this batch split on inputs, Domino can hide up to **100\%** communication behind computation. + +## Column-wise split on Weights B: + +Domino breaks weight matrix B in column dimension. + + +
+
+ + *Figure 7: Domino column-wise (last-dim) split on weights B.* + +
+ +**Data Dependency**: Split Weights B column-wise have no data dependency in intra-layer case but have data dependency in inter-layer case. Therefore, we only achieve *intra-layer* + (AllReduce($Y1$) and $X2\*A\*B$) computation-communication overlapping. This column-split on weights scheme remains essential, since row-wise input split only would lead to narrow shape tensors that hinder kernel computational efficiency. In practice, Domino achieves 50\% to 70\% communication hiding behind computation with weights B column-wise split. + +## 2D Split on both X and B: + +For extremely large LLMs, Domino splits both inputs X and weights B in row and column dimension, separately. This method is beneficial for model training requiring both low memory footprints and minimizing communication overheads. + +
+
+ + *Figure 8: Domino 2D split on both inputs X and weights B.* + +
+ +**Data Dependency**: This 2D split policy inherits synchronization at the end of each transformer layer due to column-wise split on weights B. Therefore, the 2D approach only achieves *intra-layer* computation-communication overlapping. + +# Implementation and Optimization + +For brevity, we summarize key implementation of row-wise input split. For more implementation details, please refer to our [arxiv paper](https://arxiv.org/abs/2409.15241). + +**Forward:** Figure 9 shows how we position and trigger NCCL calls in order to overlap with computation kernel sequences in forward propagation. We split batch into two chunks as $\mu$-batch0 and $\mu$-batch1. $\mu$-batch0 attention output as attn0 and MLP output as MLP0. $\mu$-batch1's attention output as attn1 and MLP output as MLP1. AllReduce(attn0) is overlapped with self-attention computation on $\mu$-batch1. For AllReduce(attn1), we group multiple $\mu$-batches' Dropout, Residual, LayerNorm computation-communication overlapping. This small kernel grouping not only enable complete hiding of AllReduce(attn1), but also provides proper overlapping space for AllReduce(MLP0) in the backward pass shown in Figure 10. For AllReduce(MLP0), we hide it behind $\mu$-batch1's MLP computation kernel sequence of GeMM + GeLU + GeMM. For AllReduce(MLP1), we hide it behind next layer's attention computation. + +
+
+ + *Figure 9: Transformer block (i.e., 1 self-attn + 1 MLP) forward pass. Upper figure is vanila TP implementation, bottom is Domino implementation.* + +
+ +**Backward:** Figure 10 shows a simple example of batch split in to two $\mu$-batches as $\mu$-batch0 and $\mu$-batch1. Besides similar overlapping strategy in the forward pass, we extend the scope of overlap communication with weights' gradient computation inside same $\mu$-batch (e.g., AllReduce(MLP1) partially overlaps with its own $\mu$-batch1 computation as the 3rd orange block from left). Each *grad matmul* includes two separate GeMM computation for inputs gradient and weights gradient. Therefore, we can extend overlapping scope by overlapping AllReduce(MLP1) with $\mu$-batch1's weights gradient computation. + +Backward is a bit more challenging because backward computation graph is automatically generated by torch.autograd(). To precisely control NCCL call triggering time, we implement a *no\_operation* module, which obtains communication handle during forward pass and retains it for use during backward pass. Our *no\_operation* module works seamlessly with torch.autograd(), and enable us precisely control NCCL start/end time without rewriting customized backward computation graph. + +
+
+ + *Figure 10: Transformer block (i.e., 1 self-attn + 1 MLP) backward pass. Upper figure is vanila TP implementation, bottom is Domino implementation.* + +
+ +**General kernel optimizations:** We adopt general kernel-level optimization techniques. For example, we use cudaGraph to squeeze idle/bubble time between adjacent compute kernels to reduce end-to-end latency. We use CUDA multi-stream to increase parallel execution. We also leverage torch.compile() to further improve our system efficiency. + +# Getting Started: Try out DeepSpeed-Domino + +To try out DeepSpeed-Domino, please refer to [Domino tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in our DeepSpeedExample repo. + +## Citation + +``` +@article{wang2024-deepspeed-domino, + title={{Domino: Eliminating Communication in LLM Training via Generic Tensor Slicing and Overlapping}}, + author={Guanhua Wang and Chengming Zhang and Zheyu Shen and Ang Li and Olatunji Ruwase}, + journal={arXiv preprint arXiv:2409.15241}, + year={2024} +} +``` + +## Acknowledgements + +This work is the result of a deep collaboration between Microsoft DeepSpeed and our academia partners from University of Maryland, University of Houston. The contributors include [Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), [Hongwei Chen](https://github.com/hwchen2017) and [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/) from Microsoft DeepSpeed Team, [Chengming Zhang](https://chengmingzh8.github.io/) from University of Houston, [Zheyu Shen](https://www.linkedin.com/in/zheyushen/) and [Ang Li](https://www.ang-li.com/) from University of Maryland. diff --git a/blogs/deepspeed-domino/images/design-base.png b/blogs/deepspeed-domino/images/design-base.png new file mode 100644 index 0000000000000..d347e9c2ba8b1 Binary files /dev/null and b/blogs/deepspeed-domino/images/design-base.png differ diff --git a/blogs/deepspeed-domino/images/design-column.png b/blogs/deepspeed-domino/images/design-column.png new file mode 100644 index 0000000000000..a99ad3c6b461e Binary files /dev/null and b/blogs/deepspeed-domino/images/design-column.png differ diff --git a/blogs/deepspeed-domino/images/design-hybrid.png b/blogs/deepspeed-domino/images/design-hybrid.png new file mode 100644 index 0000000000000..302e3f95e8fc0 Binary files /dev/null and b/blogs/deepspeed-domino/images/design-hybrid.png differ diff --git a/blogs/deepspeed-domino/images/design-row.png b/blogs/deepspeed-domino/images/design-row.png new file mode 100644 index 0000000000000..551a54f4e651d Binary files /dev/null and b/blogs/deepspeed-domino/images/design-row.png differ diff --git a/blogs/deepspeed-domino/images/domino-hero.png b/blogs/deepspeed-domino/images/domino-hero.png new file mode 100644 index 0000000000000..078b6472b42aa Binary files /dev/null and b/blogs/deepspeed-domino/images/domino-hero.png differ diff --git a/blogs/deepspeed-domino/images/domino-logo.png b/blogs/deepspeed-domino/images/domino-logo.png new file mode 100644 index 0000000000000..58be0990b9448 Binary files /dev/null and b/blogs/deepspeed-domino/images/domino-logo.png differ diff --git a/blogs/deepspeed-domino/images/gpt3-scale.png b/blogs/deepspeed-domino/images/gpt3-scale.png new file mode 100644 index 0000000000000..611b2221a73c8 Binary files /dev/null and b/blogs/deepspeed-domino/images/gpt3-scale.png differ diff --git a/blogs/deepspeed-domino/images/implement-bwd.png b/blogs/deepspeed-domino/images/implement-bwd.png new file mode 100644 index 0000000000000..4b115222f3871 Binary files /dev/null and b/blogs/deepspeed-domino/images/implement-bwd.png differ diff --git a/blogs/deepspeed-domino/images/implement-fwd.png b/blogs/deepspeed-domino/images/implement-fwd.png new file mode 100644 index 0000000000000..51d3a73bae58c Binary files /dev/null and b/blogs/deepspeed-domino/images/implement-fwd.png differ diff --git a/blogs/deepspeed-domino/images/tp-ar.png b/blogs/deepspeed-domino/images/tp-ar.png new file mode 100644 index 0000000000000..6dd01ccceed8c Binary files /dev/null and b/blogs/deepspeed-domino/images/tp-ar.png differ diff --git a/blogs/deepspeed-domino/images/tp-comm-overhead.png b/blogs/deepspeed-domino/images/tp-comm-overhead.png new file mode 100644 index 0000000000000..947473ff5261c Binary files /dev/null and b/blogs/deepspeed-domino/images/tp-comm-overhead.png differ diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp index 5790e79e2bc2a..e276ad0856dd6 100644 --- a/csrc/adagrad/cpu_adagrad.cpp +++ b/csrc/adagrad/cpu_adagrad.cpp @@ -17,9 +17,9 @@ static std::unordered_map> s_optimizers; // C++ interface -template -void Adagrad_Optimizer::Step_1(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Adagrad_Optimizer::Step_1(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg_sq, size_t _param_size) { @@ -56,9 +56,9 @@ void Adagrad_Optimizer::Step_1(ds_params_percision_t* _params, } } -template -void Adagrad_Optimizer::Step_4(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Adagrad_Optimizer::Step_4(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg_sq, size_t _param_size) { @@ -104,9 +104,9 @@ int create_adagrad_optimizer(int optimizer_id, return 0; } -template -void Adagrad_Optimizer::Step_8(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Adagrad_Optimizer::Step_8(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg_sq, size_t _param_size) { @@ -121,15 +121,15 @@ void Adagrad_Optimizer::Step_8(ds_params_percision_t* _params, (_param_size - rounded_size)); } -template +template void step_invoker(std::shared_ptr opt, void* _params, void* grads, void* _exp_avg_sq, size_t _param_size) { - opt->Step_8((ds_params_percision_t*)(_params), - (ds_params_percision_t*)(grads), + opt->Step_8((ds_params_precision_t*)(_params), + (ds_params_precision_t*)(grads), (ds_state_precision_t*)(_exp_avg_sq), _param_size); } @@ -139,12 +139,12 @@ std::map, invokers; // Fill map with template functions for each type -template +template void create_invoker() { - invokers[std::tuple(c10::CppTypeToScalarType(), + invokers[std::tuple(c10::CppTypeToScalarType(), c10::CppTypeToScalarType())] = - step_invoker; + step_invoker; } struct InvokerInitializer { InvokerInitializer() diff --git a/csrc/adam/cpu_adam_impl.cpp b/csrc/adam/cpu_adam_impl.cpp index 15d4e74d69d5f..465aae7b9a343 100644 --- a/csrc/adam/cpu_adam_impl.cpp +++ b/csrc/adam/cpu_adam_impl.cpp @@ -18,9 +18,9 @@ static std::unordered_map> s_optimizers; // C++ interface -template -void Adam_Optimizer::Step_1(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Adam_Optimizer::Step_1(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, ds_state_precision_t* _exp_avg_sq, size_t _param_size) @@ -67,9 +67,9 @@ void Adam_Optimizer::Step_1(ds_params_percision_t* _params, } } -template -void Adam_Optimizer::Step_4(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Adam_Optimizer::Step_4(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, ds_state_precision_t* _exp_avg_sq, size_t _param_size) @@ -126,9 +126,9 @@ int create_adam_optimizer(int optimizer_id, return 0; } -template -void Adam_Optimizer::Step_8(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Adam_Optimizer::Step_8(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, ds_state_precision_t* _exp_avg_sq, size_t _param_size) @@ -145,7 +145,7 @@ void Adam_Optimizer::Step_8(ds_params_percision_t* _params, (_param_size - rounded_size)); } -template +template void step_invoker(std::shared_ptr opt, void* _params, void* grads, @@ -153,8 +153,8 @@ void step_invoker(std::shared_ptr opt, void* _exp_avg_sq, size_t _param_size) { - opt->Step_8((ds_params_percision_t*)(_params), - (ds_params_percision_t*)(grads), + opt->Step_8((ds_params_precision_t*)(_params), + (ds_params_precision_t*)(grads), (ds_state_precision_t*)(_exp_avg), (ds_state_precision_t*)(_exp_avg_sq), _param_size); @@ -165,12 +165,12 @@ std::map, invokers; // Fill map with template functions for each type -template +template void create_invoker() { - invokers[std::tuple(c10::CppTypeToScalarType(), + invokers[std::tuple(c10::CppTypeToScalarType(), c10::CppTypeToScalarType())] = - step_invoker; + step_invoker; } struct InvokerInitializer { InvokerInitializer() diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp index 0536ff6a362e7..fb269b58315fb 100644 --- a/csrc/aio/common/deepspeed_aio_utils.cpp +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -19,9 +19,14 @@ const int c_io_queue_depth = 8; io_xfer_ctxt::io_xfer_ctxt(const int fd, const int64_t file_offset, + const int64_t buffer_offset, const int64_t num_bytes, const void* buffer) - : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) + : _fd(fd), + _file_base_offset(file_offset), + _buffer_base_offset(buffer_offset), + _mem_buffer(buffer), + _num_bytes(num_bytes) { } @@ -41,9 +46,10 @@ void io_prep_context::prep_iocbs(const int n_iocbs, assert(static_cast(n_iocbs) <= _iocbs->size()); for (auto i = 0; i < n_iocbs; ++i) { const auto shift = i * _block_size; - const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift; - const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift; + const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_buffer_base_offset + shift; + const auto xfer_offset = _xfer_ctxt->_file_base_offset + start_offset + shift; auto byte_count = _block_size; + if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; } if (_read_op) { @@ -79,10 +85,10 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { - const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); - const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; + const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + _xfer_ctxt->_buffer_base_offset + + (_next_iocb_index * _block_size); + const auto xfer_offset = _xfer_ctxt->_file_base_offset + (_next_iocb_index * _block_size); const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); - if (_read_op) { io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); } else { diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h index 20e81fe8eebd7..6b7599acecb44 100644 --- a/csrc/aio/common/deepspeed_aio_utils.h +++ b/csrc/aio/common/deepspeed_aio_utils.h @@ -30,12 +30,14 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. struct io_xfer_ctxt { const int _fd; - const int64_t _base_offset; + const int64_t _file_base_offset; + const int64_t _buffer_base_offset; const void* _mem_buffer; const int64_t _num_bytes; io_xfer_ctxt(const int fd, const int64_t file_offset, + const int64_t buffer_offset, const int64_t num_bytes, const void* buffer); }; diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp index 6f311c5400c72..9452513972257 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp @@ -13,14 +13,16 @@ io_op_desc_t::io_op_desc_t(const bool read_op, const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate) + const bool validate, + const int64_t file_offset) : _read_op(read_op), _buffer(buffer), _fd(fd), _filename(filename), _file_num_bytes(file_num_bytes), + _file_offset(file_offset), _intra_op_parallelism(intra_op_parallelism), - _num_bytes_per_thread(file_num_bytes / intra_op_parallelism), + _num_bytes_per_thread(static_cast(buffer.nbytes()) / intra_op_parallelism), _validate(validate) { } diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h index f841b8ce520a5..ac1cdf90f78be 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.h +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h @@ -19,6 +19,7 @@ struct io_op_desc_t { const int64_t _num_bytes_per_thread; torch::Tensor _contiguous_buffer; const bool _validate; + const int64_t _file_offset; io_op_desc_t(const bool read_op, const torch::Tensor& buffer, @@ -26,7 +27,8 @@ struct io_op_desc_t { const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate); + const bool validate, + const int64_t file_offset); virtual void run(const int tid, std::unique_ptr& aio_ctxt, diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index da2ff568d74bf..56fb33fb18869 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -16,8 +16,16 @@ cpu_op_desc_t::cpu_op_desc_t( const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate), + const bool validate, + const int64_t file_offset) + : io_op_desc_t(read_op, + buffer, + fd, + filename, + file_num_bytes, + intra_op_parallelism, + validate, + file_offset), _cpu_buffer(buffer), _pinned_tensor_mgr(pinned_tensor_mgr), _is_managed_bounce_buffer(false) @@ -66,10 +74,11 @@ void cpu_op_desc_t::run(const int tid, deepspeed_aio_config_t* aio_config) { assert(tid < _intra_op_parallelism); - const auto base_offset = _num_bytes_per_thread * tid; + const auto buffer_base_offset = _num_bytes_per_thread * tid; + const auto file_base_offset = _file_offset + (_num_bytes_per_thread * tid); - std::unique_ptr xfer_ctxt( - new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr())); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt( + _fd, file_base_offset, buffer_base_offset, _num_bytes_per_thread, data_ptr())); if (aio_config->_overlap_events) { do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h index 9de2fa2540486..debaf4a90731f 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.h +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -20,7 +20,8 @@ struct cpu_op_desc_t : io_op_desc_t { const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate); + const bool validate, + const int64_t file_offset); void run(const int tid, std::unique_ptr& aio_ctxt, diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 02b04057d1ac3..1ff0397043fae 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -52,7 +52,9 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, auto write_buffer = (char*)buffer.data_ptr(); const auto num_write_bytes = static_cast(buffer.nbytes()); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, 0, 0, num_write_bytes, write_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); if (config._overlap_events) { @@ -97,7 +99,8 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, auto read_buffer = (char*)buffer.data_ptr(); assert(static_cast(buffer.nbytes()) == num_file_bytes); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, 0, 0, num_file_bytes, read_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); if (config._overlap_events) { diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp index 48ea8a1339d45..64d7c2e0541e1 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -58,7 +58,10 @@ const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_e const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; } -int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) +int deepspeed_io_handle_t::read(torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset) { const auto start_time = std::chrono::high_resolution_clock::now(); @@ -76,7 +79,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con if (fd == -1) { return -1; } auto read_buffer = (char*)buffer.data_ptr(); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, file_offset, 0, num_file_bytes, read_buffer)); if (_aio_config._overlap_events) { do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); @@ -98,7 +102,8 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con int deepspeed_io_handle_t::write(const torch::Tensor& buffer, const char* filename, - const bool validate) + const bool validate, + const int64_t file_offset) { assert(_aio_ctxt); @@ -109,7 +114,8 @@ int deepspeed_io_handle_t::write(const torch::Tensor& buffer, auto write_buffer = (char*)buffer.data_ptr(); const auto num_write_bytes = static_cast(buffer.nbytes()); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, file_offset, 0, num_write_bytes, write_buffer)); if (_aio_config._overlap_events) { do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); @@ -206,7 +212,8 @@ std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc( const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate) + const bool validate, + const int64_t file_offset) { return std::make_shared(read_op, buffer, @@ -215,13 +222,15 @@ std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc( filename, file_num_bytes, _intra_op_parallelism, - validate); + validate, + file_offset); } int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async) + const bool async, + const int64_t file_offset) { int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { @@ -229,20 +238,18 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, report_file_error(filename, " fstat for read", error_code); return -1; } + + // buffer can exceed file size to enable 4k alignment const auto buffer_bytes = static_cast(buffer.nbytes()); - if (buffer_bytes != num_file_bytes) { - std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes - << " != " << num_file_bytes << std::endl; - } - assert(buffer_bytes == num_file_bytes); assert((num_file_bytes % _intra_op_parallelism) == 0); - if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } + if (!_is_valid_parallel_aio_op(true, buffer_bytes)) { return -1; } const auto fd = open_file(filename, true); if (fd == -1) { return -1; } - auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate); + auto scheduled_op = + _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate, file_offset); _schedule_aio_work(scheduled_op); @@ -254,7 +261,8 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async) + const bool async, + const int64_t file_offset) { const auto num_write_bytes = static_cast(buffer.nbytes()); assert((num_write_bytes % _intra_op_parallelism) == 0); @@ -264,7 +272,8 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, const auto fd = open_file(filename, false); if (fd == -1) { return -1; } - auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate); + auto scheduled_op = + _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate, file_offset); _schedule_aio_work(scheduled_op); @@ -273,24 +282,32 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, return wait(); } -int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pread(buffer, filename, false, false); + return pread(buffer, filename, false, false, file_offset); } -int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pwrite(buffer, filename, false, false); + return pwrite(buffer, filename, false, false, file_offset); } -int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pread(buffer, filename, false, true); + return pread(buffer, filename, false, true, file_offset); } -int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) +int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) { - return pwrite(buffer, filename, false, true); + return pwrite(buffer, filename, false, true, file_offset); } at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem, diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h index 4fedf80808189..dfcb4125ab9ad 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h @@ -38,27 +38,35 @@ struct deepspeed_io_handle_t { const bool get_overlap_events() const; const int get_intra_op_parallelism() const; - int read(torch::Tensor& buffer, const char* filename, const bool validate); + int read(torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset); - int write(const torch::Tensor& buffer, const char* filename, const bool validate); + int write(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset); int pread(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async); + const bool async, + const int64_t file_offset); int pwrite(const torch::Tensor& buffer, const char* filename, const bool validate, - const bool async); + const bool async, + const int64_t file_offset); - int sync_pread(torch::Tensor& buffer, const char* filename); + int sync_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset); - int sync_pwrite(const torch::Tensor& buffer, const char* filename); + int sync_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset); - int async_pread(torch::Tensor& buffer, const char* filename); + int async_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset); - int async_pwrite(const torch::Tensor& buffer, const char* filename); + int async_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset); // TODO: Make API's args to be shape and dtype. torch::Tensor new_cpu_locked_tensor(const int64_t num_elem, @@ -81,5 +89,6 @@ struct deepspeed_io_handle_t { const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate); + const bool validate, + const int64_t file_offset); }; diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index b80fa2d6c8e6d..bf298b691b814 100644 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -40,14 +40,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "Synchronous and non-parallel file read. Returns count of completed read ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("write", &deepspeed_aio_handle_t::write, "Synchronous and non-parallel file write. Returns count of completed write ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("pread", &deepspeed_aio_handle_t::pread, @@ -55,7 +57,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("pwrite", &deepspeed_aio_handle_t::pwrite, @@ -63,33 +66,38 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("sync_pread", &deepspeed_aio_handle_t::sync_pread, "Synchrononous parallel file read. Returns count of completed read ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite, "Synchronous parallel file write. Returns count of completed write ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pread", &deepspeed_aio_handle_t::async_pread, "Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and " "following wait() returns count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite, "Asynchronous parallel file write. Returns 0 on success, and following wait() returns " "count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor, diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index f4a179deb9ec6..6913e9090bf57 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -92,7 +92,7 @@ def main_parallel_read(pool_params): start_time = time.time() dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER - ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True) + ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, 0, True) assert ret != -1 handle.wait() if dest_buffer == BOUNCE_BUFFER: diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp index f49f74394374e..b7055c8cc72b3 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -95,8 +95,16 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate) + const bool validate, + const int64_t file_offset) + : io_op_desc_t(read_op, + buffer, + fd, + filename, + file_num_bytes, + intra_op_parallelism, + validate, + file_offset) { _contiguous_buffer = _buffer.contiguous(); const int64_t device = _buffer.get_device(); @@ -124,17 +132,17 @@ void gds_op_desc_t::run(const int tid, { assert(tid < _intra_op_parallelism); check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); - int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; - const auto file_offset = _num_bytes_per_thread * tid; + const auto buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; + const auto tid_file_offset = _file_offset + (_num_bytes_per_thread * tid); if (_read_op) { auto ret = - cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); - if (ret < 0) { _report_error(ret, errno, buf_offset); } + cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, tid_file_offset); } } else { auto ret = - cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset); - if (ret < 0) { _report_error(ret, errno, buf_offset); } + cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, tid_file_offset); } } } diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h index 380bb0b9b6aed..d955527b1ba33 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -24,7 +24,8 @@ struct gds_op_desc_t : io_op_desc_t { const char* filename, const int64_t file_num_bytes, const int intra_op_parallelism, - const bool validate); + const bool validate, + const int64_t file_offset); void run(const int tid, std::unique_ptr& aio_ctxt, diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index c052144a0190b..f11245c75a5e6 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -107,12 +107,19 @@ std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate) + const bool validate, + const int64_t file_offset) { if (buffer.is_cuda()) { - return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, _intra_op_parallelism, validate); + return std::make_shared(read_op, + buffer, + fd, + filename, + file_num_bytes, + _intra_op_parallelism, + validate, + file_offset); } return deepspeed_io_handle_t::_create_io_op_desc( - read_op, buffer, fd, filename, file_num_bytes, validate); + read_op, buffer, fd, filename, file_num_bytes, validate, file_offset); } diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index 131e83e7b838a..25f68e177b2cb 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -42,7 +42,8 @@ struct deepspeed_gds_handle_t : deepspeed_io_handle_t { const int fd, const char* filename, const int64_t file_num_bytes, - const bool validate); + const bool validate, + const int64_t file_offset); static int s_cuFile_init; }; diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp index 57bf8d2207c45..2f165ee2c32a4 100644 --- a/csrc/gds/py_lib/py_ds_gds.cpp +++ b/csrc/gds/py_lib/py_ds_gds.cpp @@ -33,14 +33,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "Synchronous and non-parallel file read. Returns count of completed read ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("write", &deepspeed_gds_handle_t::write, "Synchronous and non-parallel file write. Returns count of completed write ops", "buffer"_a, "filename"_a, - "validate"_a) + "validate"_a, + "file_offset"_a = 0) .def("pread", &deepspeed_gds_handle_t::pread, @@ -48,7 +50,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("pwrite", &deepspeed_gds_handle_t::pwrite, @@ -56,33 +59,38 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "buffer"_a, "filename"_a, "validate"_a, - "async"_a) + "async"_a, + "file_offset"_a = 0) .def("sync_pread", &deepspeed_gds_handle_t::sync_pread, "Synchrononous parallel file read. Returns count of completed read ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("sync_pwrite", &deepspeed_gds_handle_t::sync_pwrite, "Synchronous parallel file write. Returns count of completed write ops", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pread", &deepspeed_gds_handle_t::async_pread, "Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and " "following wait() returns count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("async_pwrite", &deepspeed_gds_handle_t::async_pwrite, "Asynchronous parallel file write. Returns 0 on success, and following wait() returns " "count of completed ops.", "buffer"_a, - "filename"_a) + "filename"_a, + "file_offset"_a = 0) .def("new_cpu_locked_tensor", &deepspeed_gds_handle_t::new_cpu_locked_tensor, diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h index c06d3a6b35e93..6f500250f033a 100644 --- a/csrc/includes/cpu_adagrad.h +++ b/csrc/includes/cpu_adagrad.h @@ -14,9 +14,9 @@ #include "simd.h" #define STEP(SPAN) \ - template \ - void Step_##SPAN(ds_params_percision_t* _params, \ - ds_params_percision_t* grads, \ + template \ + void Step_##SPAN(ds_params_precision_t* _params, \ + ds_params_precision_t* grads, \ ds_state_precision_t* _exp_avg_sq, \ size_t _param_size); @@ -28,10 +28,10 @@ class Adagrad_Optimizer { } ~Adagrad_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - ds_params_percision_t* _params, - ds_params_percision_t* grads, + ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg_sq, size_t param_size); #endif @@ -61,15 +61,15 @@ class Adagrad_Optimizer { }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, - ds_params_percision_t* _params, - ds_params_percision_t* grads, + ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg_sq, size_t _param_size) { #if !defined(__AVX512__) - if (std::is_same_v || + if (std::is_same_v || std::is_same_v) { return; } diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index faf99020aee52..a7db6fda37054 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -14,9 +14,9 @@ #include "simd.h" #define STEP(SPAN) \ - template \ - void Step_##SPAN(ds_params_percision_t* _params, \ - ds_params_percision_t* grads, \ + template \ + void Step_##SPAN(ds_params_precision_t* _params, \ + ds_params_precision_t* grads, \ ds_state_precision_t* _exp_avg, \ ds_state_precision_t* _exp_avg_sq, \ size_t _param_size); @@ -43,10 +43,10 @@ class Adam_Optimizer { ~Adam_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - ds_params_percision_t* _params, - ds_params_percision_t* grads, + ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, ds_state_precision_t* _exp_avg_sq, size_t param_size); @@ -106,16 +106,16 @@ class Adam_Optimizer { }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Adam_Optimizer::Step_AVX(size_t* rounded_size, - ds_params_percision_t* _params, - ds_params_percision_t* grads, + ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, ds_state_precision_t* _exp_avg_sq, size_t _param_size) { #if !defined(__AVX512__) - if (std::is_same_v || + if (std::is_same_v || std::is_same_v) { return; } diff --git a/csrc/includes/cpu_lion.h b/csrc/includes/cpu_lion.h index 62b3049232222..beaf357a32117 100644 --- a/csrc/includes/cpu_lion.h +++ b/csrc/includes/cpu_lion.h @@ -14,9 +14,9 @@ #include "simd.h" #define STEP(SPAN) \ - template \ - void Step_##SPAN(ds_params_percision_t* _params, \ - ds_params_percision_t* grads, \ + template \ + void Step_##SPAN(ds_params_precision_t* _params, \ + ds_params_precision_t* grads, \ ds_state_precision_t* _exp_avg, \ size_t _param_size); @@ -32,10 +32,10 @@ class Lion_Optimizer { ~Lion_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - ds_params_percision_t* _params, - ds_params_percision_t* grads, + ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, size_t param_size); #endif @@ -67,15 +67,15 @@ class Lion_Optimizer { }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Lion_Optimizer::Step_AVX(size_t* rounded_size, - ds_params_percision_t* _params, - ds_params_percision_t* grads, + ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, size_t _param_size) { #if !defined(__AVX512__) - if (std::is_same_v || + if (std::is_same_v || std::is_same_v) { return; } diff --git a/csrc/lion/cpu_lion_impl.cpp b/csrc/lion/cpu_lion_impl.cpp index 85896ba86e194..6a98162314f9f 100644 --- a/csrc/lion/cpu_lion_impl.cpp +++ b/csrc/lion/cpu_lion_impl.cpp @@ -19,9 +19,9 @@ static std::unordered_map> s_optimizers; // C++ interface -template -void Lion_Optimizer::Step_1(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Lion_Optimizer::Step_1(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, size_t _param_size) { @@ -64,9 +64,9 @@ void Lion_Optimizer::Step_1(ds_params_percision_t* _params, } } -template -void Lion_Optimizer::Step_4(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Lion_Optimizer::Step_4(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, size_t _param_size) { @@ -117,9 +117,9 @@ int create_lion_optimizer(int optimizer_id, return 0; } -template -void Lion_Optimizer::Step_8(ds_params_percision_t* _params, - ds_params_percision_t* grads, +template +void Lion_Optimizer::Step_8(ds_params_precision_t* _params, + ds_params_precision_t* grads, ds_state_precision_t* _exp_avg, size_t _param_size) { @@ -134,15 +134,15 @@ void Lion_Optimizer::Step_8(ds_params_percision_t* _params, (_param_size - rounded_size)); } -template +template void step_invoker(std::shared_ptr opt, void* _params, void* grads, void* _exp_avg, size_t _param_size) { - opt->Step_8((ds_params_percision_t*)(_params), - (ds_params_percision_t*)(grads), + opt->Step_8((ds_params_precision_t*)(_params), + (ds_params_precision_t*)(grads), (ds_state_precision_t*)(_exp_avg), _param_size); } @@ -152,12 +152,12 @@ std::map, invokers; // Fill map with template functions for each type -template +template void create_invoker() { - invokers[std::tuple(c10::CppTypeToScalarType(), + invokers[std::tuple(c10::CppTypeToScalarType(), c10::CppTypeToScalarType())] = - step_invoker; + step_invoker; } struct InvokerInitializer { InvokerInitializer() diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu index 25a494111c54b..bbb8a7f00b1f5 100644 --- a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu +++ b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu @@ -101,7 +101,15 @@ __global__ void apply_rotary_pos_half(T* mixed_query, #if defined(__HIP_PLATFORM_AMD__) and ROCM_WAVEFRONT_SIZE == 64 #define LAUNCH_FOR_ALIGNMENT(ALIGNMENT) \ - if (threads_per_head == 64) { \ + if (threads_per_head == 4) { \ + LAUNCH_ROT_POS_EMB_HALF(4, ALIGNMENT); \ + } else if (threads_per_head == 8) { \ + LAUNCH_ROT_POS_EMB_HALF(8, ALIGNMENT); \ + } else if (threads_per_head == 16) { \ + LAUNCH_ROT_POS_EMB_HALF(16, ALIGNMENT); \ + } else if (threads_per_head == 32) { \ + LAUNCH_ROT_POS_EMB_HALF(32, ALIGNMENT); \ + } else if (threads_per_head == 64) { \ LAUNCH_ROT_POS_EMB_HALF(64, ALIGNMENT); \ } else { \ assert(false); \ diff --git a/csrc/xpu/adam/multi_tensor_apply.dp.hpp b/csrc/xpu/adam/multi_tensor_apply.dp.hpp index 01964e1a4d385..14a130e2a23e9 100644 --- a/csrc/xpu/adam/multi_tensor_apply.dp.hpp +++ b/csrc/xpu/adam/multi_tensor_apply.dp.hpp @@ -109,6 +109,12 @@ class multi_tensor_apply_kernel { std::tuple args; }; +// to make sure multi_tensor_apply_kernel can be used in sycl::buffer +namespace sycl { +template +struct is_device_copyable> : std::true_type {}; +} // namespace sycl + template void multi_tensor_apply(int block_size, int chunk_size, diff --git a/csrc/xpu/aio/deepspeed_cpu_op.cpp b/csrc/xpu/aio/deepspeed_cpu_op.cpp deleted file mode 100644 index ee98c2d5cac2f..0000000000000 --- a/csrc/xpu/aio/deepspeed_cpu_op.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// SPDX-License-Identifier: Apache-2.0 - -// DeepSpeed Team - -#include "deepspeed_cpu_op.h" - -using namespace std; - -cpu_op_desc_t::cpu_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const int num_threads, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate), - _cpu_buffer(buffer) -{ - // XPU don't handle buffer here. See XPU Accelerator pin_memory. - _contiguous_buffer = _cpu_buffer.contiguous(); -} - -char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } - -void cpu_op_desc_t::finish() -{ - if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } -} - -void cpu_op_desc_t::validate() -{ - validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes); -} - -void cpu_op_desc_t::run(const int tid, - std::unique_ptr& aio_ctxt, - deepspeed_aio_config_t* aio_config) -{ - assert(tid < _num_threads); - const auto base_offset = _num_bytes_per_thread * tid; - - std::unique_ptr xfer_ctxt( - new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr())); - - if (aio_config->_overlap_events) { - do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); - } else { - do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); - } -} diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index d8655299282f7..a8d15cd5332bc 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -42,7 +42,7 @@ from .utils import log_dist, OnDevice, logger from .comm.comm import init_distributed -from .runtime import zero +from .runtime import zero, domino from .runtime.compiler import is_compile_supported from .pipe import PipelineModule @@ -165,8 +165,8 @@ def initialize(args=None, if hasattr(args, "deepscale_config") and args.deepscale_config is not None: logger.warning("************ --deepscale_config is deprecated, please use --deepspeed_config ************") if hasattr(args, "deepspeed_config"): - assert (args.deepspeed_config is - None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" + assert (args.deepspeed_config + is None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" args.deepspeed_config = args.deepscale_config args.deepscale_config = None diff --git a/deepspeed/autotuning/README.md b/deepspeed/autotuning/README.md index b1fa435364d2a..fc76ed1e9f8d7 100755 --- a/deepspeed/autotuning/README.md +++ b/deepspeed/autotuning/README.md @@ -336,7 +336,7 @@ The Autotuner stops exploring the space when any of the following conditions mee ## Using Autotuning with Hugging Face -Hugging Face users can set some configurations values to ["auto"](https://huggingface.co/transformers/main_classes/deepspeed.html?highlight=gradient_accumulation_steps#shared-configuration). +Hugging Face users can set some configurations values to ["auto"](https://huggingface.co/docs/transformers/deepspeed#deepspeed-and-trainer-parameters). `"auto"` means the value will be set to the default in Hugging Face or be overwritten using the supplied values from the command line arguments. In DeepSpeed Autotuning, if the user-provided DeepSpeed configuration file has "auto" keywords, they are treated as the value "auto". diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index dfd195bc37ebd..a72b3c951e973 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -248,8 +248,8 @@ def mp_size(self): return self.autotuning_config.mp_size def max_train_micro_batch_size_per_gpu(self): - if self.max_train_batch_size( - ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + if self.max_train_batch_size() and self.max_train_batch_size( + ) > 0: # if the user specifies a max_train_batch_size max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size() // ( self.exp_num_gpus * self.exp_num_nodes) # gradient accumulation steps >=1 return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, max_train_micro_batch_size) @@ -964,8 +964,8 @@ def get_min_max_micro_batch_size(self, stage, min_micro_batch_size, calculated_m low = mid + 1 self.update_records(tuning_space_name, exp, metric_val, 1) used_micro_batch_sizes.append(mid) - if prev_metric_val and ( - (metric_val - prev_metric_val) / prev_metric_val) < METRIC_PERCENT_DIFF_CONST: + if prev_metric_val and ((metric_val - prev_metric_val) / + prev_metric_val) < METRIC_PERCENT_DIFF_CONST: logger.info(f"performance plateaus at mbs = {low}") break prev_metric_val = metric_val @@ -1026,8 +1026,8 @@ def get_tuning_micro_batch_size_list(self, min_micro_batch_size, max_micro_batch # NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} )) # DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) )) # GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${BATCH_SIZE} * ${DP_SIZE}) )) - if self.max_train_batch_size( - ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + if self.max_train_batch_size() and self.max_train_batch_size( + ) > 0: # if the user specifies a max_train_batch_size max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size() // (self.exp_num_gpus * self.exp_num_nodes) else: diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py index 31997177a2623..9a368b7a0a254 100644 --- a/deepspeed/checkpoint/deepspeed_checkpoint.py +++ b/deepspeed/checkpoint/deepspeed_checkpoint.py @@ -116,7 +116,7 @@ def show_transformer_file_map(self): self._dump_mapping(self.transformer_file_map, 'rank_to_transformer_files') def _build_global_state(self): - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) @@ -137,14 +137,17 @@ def get_final_norm_layer_id(self): def get_iteration(self): if not ITERATION_KEY in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) return self.global_state[ITERATION_KEY] def get_embedding_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_embedding_map.keys() - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]] + sd_list = [ + torch.load(fname, map_location=torch.device('cpu'), weights_only=False) + for fname in self.tp_to_embedding_map[tp_index] + ] sd = self._merge_state_dicts(sd_list) return sd @@ -154,7 +157,7 @@ def get_embedding_files(self, tp_index: int) -> list: def _get_checkpoint_value(self, key): if not key in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) self.global_state[key] = sd.get(key, None) return self.global_state[key] @@ -169,7 +172,7 @@ def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict: assert tp_index < self.tp_degree assert pp_index < self.pp_degree fname_list = self.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index) - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] + sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list] merged_sd = None for sd in sd_list: @@ -185,7 +188,7 @@ def get_transformer_state(self, tp_index: int, pp_index: int) -> list: assert pp_index < self.pp_degree t_list = [] for fname_list in self.transformer_file_map[(tp_index, pp_index)]: - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] + sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list] sd = self._merge_state_dicts(sd_list) t_list.append(sd) return t_list @@ -196,7 +199,7 @@ def get_pp_transformer_map(self, pp_index: int) -> list: def get_final_norm_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_final_norm_map.keys() - sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu')) + sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'), weights_only=False) return sd def get_final_norm_files(self, tp_index: int) -> list: diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index e5974a30df22e..f7b75eee66d06 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -150,7 +150,7 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D): def extract_zero_shards_stage3(optim_files, param_shapes, dp_degree, temp_dir, dp_index): - state_dict = torch.load(optim_files[dp_index], map_location='cpu') + state_dict = torch.load(optim_files[dp_index], map_location='cpu', weights_only=False) flat_state = dict( exp_avg=state_dict[OPTIMIZER_STATE_DICT]['optimizer_state_dict']['state'][0]["exp_avg"], @@ -214,7 +214,7 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape=None): raise ValueError(f"Cannot parse dp_rank from {p}") paths = [f"{prefix_path}.{dp_index_to_str(dp_index)}" for dp_index in sorted(list(dp_indices))] - shards = [torch.load(p) for p in paths] + shards = [torch.load(p, weights_only=False) for p in paths] if state == "step": assert all(v == shards[0] for v in shards), "All shards must have the same step value" @@ -404,7 +404,7 @@ def _zero_partitioned_param_info(unpartitioned_numel, world_size): def _parse_model_states_stage3(files): - return torch.load(files[0], map_location=torch.device('cpu'))[PARAM_SHAPES] + return torch.load(files[0], map_location=torch.device('cpu'), weights_only=False)[PARAM_SHAPES] def _save_optimizer_state(args, ds_checkpoint): @@ -420,7 +420,7 @@ def _save_optimizer_state(args, ds_checkpoint): def _save_optimizer_state_stage3(args, optim_files): - sd = torch.load(optim_files[0], map_location=torch.device('cpu')) + sd = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False) output_sd = sd[OPTIMIZER_STATE_DICT] output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS] zero_output_folder = os.path.join(args.output_folder, "zero") @@ -446,7 +446,7 @@ def _get_checkpoint_files(checkpoint_dir, glob_pattern): def _get_zero_stage(optim_files): - state_dict = torch.load(optim_files[0], map_location=torch.device('cpu')) + state_dict = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False) optimizer_state = state_dict[OPTIMIZER_STATE_DICT] zero_stage = optimizer_state.get(ZERO_STAGE, 1) return zero_stage @@ -454,7 +454,7 @@ def _get_zero_stage(optim_files): def _inject_missing_state(ds_checkpoint): if UNIVERSAL_CHECKPOINT_INFO not in ds_checkpoint.global_state: - sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu')) + sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) if UNIVERSAL_CHECKPOINT_INFO not in sd: ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {} ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO][ @@ -488,7 +488,7 @@ def main(args): slice_shapes = [] for mp_rank_file in ds_checkpoint.mp_rank_files: - mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu')) + mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu'), weights_only=False) slice_shapes += mp_sd[PARAM_SHAPES] # fix back to normal flat dict, merge duplicates for tp>1 diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index 064891a8bb540..266d5a0635951 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -34,7 +34,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): step = None for key in hp_keys: ckpt_file = os.path.join(folder, f"{key}.pt") - ckpt_dict = torch.load(ckpt_file) + ckpt_dict = torch.load(ckpt_file, weights_only=False) if key == "step": step = ckpt_dict diff --git a/deepspeed/checkpoint/utils.py b/deepspeed/checkpoint/utils.py index c305e8884e83b..5964da00728e4 100644 --- a/deepspeed/checkpoint/utils.py +++ b/deepspeed/checkpoint/utils.py @@ -51,7 +51,12 @@ def clone_tensors_for_torch_save(item, device=torch.device('cpu')): - copy of ``item`` with cloned tensors on target device """ if torch.is_tensor(item): - return item.detach().clone().to(device) + if type(device) is str: + device = torch.device(device) + if device == item.device: + return item.detach().clone() + else: + return item.detach().to(device) elif isinstance(item, list): return [clone_tensors_for_torch_save(v, device) for v in item] elif isinstance(item, tuple): diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py index 6730b93dfd4fb..c85f0241005d1 100644 --- a/deepspeed/checkpoint/zero_checkpoint.py +++ b/deepspeed/checkpoint/zero_checkpoint.py @@ -54,7 +54,7 @@ def get_state_for_rank(self, pp_index, tp_index, dp_index, keys_to_ignore=[], st state_file_list = self.get_files_for_rank(pp_index, tp_index, dp_index) merged_sd = None for state_file in state_file_list: - sd = torch.load(state_file, map_location=torch.device('cpu')) + sd = torch.load(state_file, map_location=torch.device('cpu'), weights_only=False) for key in keys_to_ignore: sd.pop(key, None) diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index ed2645d415c41..988b74232bb92 100755 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -390,9 +390,14 @@ def init_device_mesh(self, mesh_shape, mesh_dim_names): if not required_torch_version(min_version=2.2): raise RuntimeError(f"Current torch version does not have device mesh" f"api (torch.__version__: {torch.__version__})") - return torch.distributed.device_mesh.init_device_mesh(get_accelerator().current_device_name(), - mesh_shape, - mesh_dim_names=mesh_dim_names) + if not required_torch_version(max_version=2.4): + return torch.distributed.device_mesh.init_device_mesh(get_accelerator().device_name(), + mesh_shape, + mesh_dim_names=mesh_dim_names) + else: + return torch.distributed.device_mesh.init_device_mesh(get_accelerator().current_device_name(), + mesh_shape, + mesh_dim_names=mesh_dim_names) # This will become a light-weight wrapper around torch.distributed functions diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py index c6a69dd2a49fb..8fd4293d312c0 100644 --- a/deepspeed/elasticity/elastic_agent.py +++ b/deepspeed/elasticity/elastic_agent.py @@ -160,8 +160,8 @@ def _invoke_run(self, role: str = "default") -> RunResult: f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish.") self._exit_barrier() return run_result - elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED - } or len(participants) > len(rdzv_handler._state_holder.state.participants): + elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED} or len(participants) > len( + rdzv_handler._state_holder.state.participants): if self._remaining_restarts > 0: log.info(f"[{role}] Worker group {state.name}. " f"{self._remaining_restarts}/{spec.max_restarts} attempts left;" diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 6574d49fb1323..cfca1ff4fe4cf 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -452,7 +452,7 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): checkpoint = sd_loader['checkpoints'] if type(checkpoint) is list: - self.sd = torch.load(checkpoint[0], map_location='cpu') + self.sd = torch.load(checkpoint[0], map_location='cpu', weights_only=False) self.key_list = list(self.sd.keys()) self.load_model_with_checkpoint(self.module) @@ -460,7 +460,7 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): for i in range(1, len(checkpoint)): if not dist.is_initialized() or dist.get_rank() == 0: print(f"loading checkpoint ({i})") - self.sd = torch.load(checkpoint[i], map_location=get_accelerator().device_name()) + self.sd = torch.load(checkpoint[i], map_location=get_accelerator().device_name(), weights_only=False) self.key_list = list(self.sd.keys()) self.load_model_with_checkpoint(self.module) else: diff --git a/deepspeed/inference/v2/checkpoint/huggingface_engine.py b/deepspeed/inference/v2/checkpoint/huggingface_engine.py index d88d99ebebfd1..b17bb886838f1 100644 --- a/deepspeed/inference/v2/checkpoint/huggingface_engine.py +++ b/deepspeed/inference/v2/checkpoint/huggingface_engine.py @@ -80,7 +80,7 @@ def model_has_safetensors(model_name_or_path: str) -> bool: else: model_param_json_fname = "pytorch_model.bin.index.json" model_file_fname = "pytorch_model.bin" - self._checkpoint_load_fn = partial(torch.load, map_location="cpu") + self._checkpoint_load_fn = partial(torch.load, map_location="cpu", weights_only=False) model_param_json = os.path.join(self._local_checkpoint_dir, model_param_json_fname) diff --git a/deepspeed/inference/v2/model_implementations/inference_policy_base.py b/deepspeed/inference/v2/model_implementations/inference_policy_base.py index d5a326c03599e..2f4266a8cb880 100644 --- a/deepspeed/inference/v2/model_implementations/inference_policy_base.py +++ b/deepspeed/inference/v2/model_implementations/inference_policy_base.py @@ -205,7 +205,7 @@ def populate_model_parameters(self) -> None: buffer_path = make_param_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size) metadata_path = make_metadata_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size) - buffer = torch.load(buffer_path) + buffer = torch.load(buffer_path, weights_only=False) metadata = json.load(open(metadata_path, "r")) metadata = ModelMetadata.parse_raw(metadata) diff --git a/deepspeed/inference/v2/model_implementations/layer_container_base.py b/deepspeed/inference/v2/model_implementations/layer_container_base.py index f26c875566650..feb65b4a5f5d1 100644 --- a/deepspeed/inference/v2/model_implementations/layer_container_base.py +++ b/deepspeed/inference/v2/model_implementations/layer_container_base.py @@ -14,7 +14,7 @@ # Currently have dependency loops for the type hints. InferenceModel = Type["InferenceModel"] -LayerContainer = Type["LayerContainer"] +LayerContainer = Type["LayerContainer"] # noqa: F811 MAPPING_KEY = "PARAM_MAPPING" PLIST_HELPERS = "_ds_plist_strip_vals" @@ -161,7 +161,7 @@ def __call__(cls, *args, **kwargs): return instance -class LayerContainer(metaclass=LayerMetaclass): +class LayerContainer(metaclass=LayerMetaclass): # noqa: F811 """ Abstract base class for containing model parameters. diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index cf70c4530c822..7afe6ca903fb2 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -277,8 +277,10 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): if hasattr(model_config, "vision_config"): if "MllamaVisionEncoderLayer" in str(module): num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config) - else: + elif hasattr(model_config, "text_config"): num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config) else: num_kv_heads = _autotp.get_model_num_kv_heads(model_config) @@ -345,7 +347,7 @@ def set_lm_head(module): "weight") and not module.embed_out.weight.is_meta and isinstance( module.embed_out, torch.nn.Linear): module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") - elif hasattr(module.language_model, "lm_head"): + elif hasattr(module, "language_model") and hasattr(module.language_model, "lm_head"): module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head") return module @@ -413,7 +415,7 @@ def conv2d_parallel_shard_weights(model, rank, world_size): pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards") for i in range(len(checkpoint)): - sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu')] + sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu', weights_only=False)] load_model_with_checkpoint(replaced_module, sd, mp_replace, @@ -435,7 +437,7 @@ def conv2d_parallel_shard_weights(model, rank, world_size): os.path.join(base_dir1, ckpt_list[ckpt_index + j]) if base_dir1 else ckpt_list[ckpt_index + j] for j in range(sd_count) ] - sds = [torch.load(ckpt_file, map_location='cpu') for ckpt_file in ckpt_files] + sds = [torch.load(ckpt_file, map_location='cpu', weights_only=False) for ckpt_file in ckpt_files] load_model_with_checkpoint(replaced_module, sds, mp_replace, @@ -455,7 +457,7 @@ def conv2d_parallel_shard_weights(model, rank, world_size): pbar.update(1) ckpt_file = os.path.join(base_dir1, checkpoint["non_tp"][i]) if base_dir1 else checkpoint["non_tp"][i] - sds = [torch.load(ckpt_file, map_location='cpu')] + sds = [torch.load(ckpt_file, map_location='cpu', weights_only=False)] load_model_with_checkpoint(replaced_module, sds, mp_replace, @@ -494,9 +496,10 @@ def conv2d_parallel_shard_weights(model, rank, world_size): if not dist.is_initialized() or dist.get_rank() == 0: print("Saving tp-sharded checkpoints") torch.save( - OrderedDict({k: v - for k, v in dict(replaced_module.state_dict()).items() - if transformer_name not in k}), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') + OrderedDict({ + k: v + for k, v in dict(replaced_module.state_dict()).items() if transformer_name not in k + }), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') dtype_reprs = { torch.float32: 'float32', @@ -621,7 +624,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No from safetensors.torch import load_file sd = load_file(checkpoint) else: - sd = torch.load(checkpoint, map_location='cpu') + sd = torch.load(checkpoint, map_location='cpu', weights_only=False) policy = {} if orig_class is not None: diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index 6758c7a657f63..57be0c7938560 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -24,7 +24,9 @@ def set_n_embd(num): def get_num_kv_heads(): global num_kv_heads - return num_kv_heads + if 'num_kv_heads' in globals(): + return num_kv_heads + return None def get_num_attention_heads(): diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index 96306184e42cf..f87f1beb7e4eb 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -115,7 +115,7 @@ def start_time_hook(module, input): get_accelerator().synchronize() module.__start_time__ = time.time() - if not hasattr(module, "__start_time_hook_handle"): + if not hasattr(module, "__start_time_hook_handle__"): module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook) def end_time_hook(module, input, output): diff --git a/deepspeed/runtime/base_optimizer.py b/deepspeed/runtime/base_optimizer.py index 6cfd66f1cc38a..b8df7499450db 100644 --- a/deepspeed/runtime/base_optimizer.py +++ b/deepspeed/runtime/base_optimizer.py @@ -22,7 +22,7 @@ def load_hp_checkpoint_state_from_checkpoint_dir(self, lp_groups_name: str, chec optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt") assert os.path.isfile( optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.' - optim_sd = torch.load(optim_state_path) + optim_sd = torch.load(optim_state_path, weights_only=False) self._load_global_state(optim_sd) diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index e26e3243c4b5d..e834bf0d22d79 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -58,7 +58,7 @@ def load(self, path: str, map_location=None): if not self.enable_nebula_load and first_load_flag: self.tag_flag = tag logger.info(f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...") - partition = torch.load(path, map_location=map_location) + partition = torch.load(path, map_location=map_location, weights_only=False) logger.info(f"[Nebula] Disable nebula load. Loaded checkpoint from {path} .") return partition diff --git a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py index 5cd44864bb2ea..076c638532ad3 100644 --- a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py @@ -25,7 +25,7 @@ def save(self, state_dict, path: str): def load(self, path: str, map_location=None): logger.info(f"[Torch] Loading checkpoint from {path}...") - partition = torch.load(path, map_location=map_location) + partition = torch.load(path, map_location=map_location, weights_only=False) logger.info(f"[Torch] Loaded checkpoint from {path}.") return partition diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py index 879c0a1a2cc9c..fa9220f4fcd0f 100644 --- a/deepspeed/runtime/compiler.py +++ b/deepspeed/runtime/compiler.py @@ -5,6 +5,15 @@ import torch +try: + from torch.compiler import is_compiling as torch_is_compiling +except ImportError: + try: + from torch._dynamo.external_utils import is_compiling as torch_is_compiling + except ImportError: + # Torch does not have compiler support + torch_is_compiling = lambda: False + def is_compile_supported(): return hasattr(torch, "compiler") and hasattr(torch.nn.Module, "compile") @@ -14,3 +23,7 @@ def disable(func): if is_compile_supported(): return torch.compiler.disable(func) return func + + +def is_compiling(): + return torch_is_compiling() diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 8be2f7ac40557..fb786f29722d7 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -1012,8 +1012,8 @@ def _do_error_check(self): self.gradient_accumulation_steps), "DeepSpeedConfig: {} is not defined".format(GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - assert (self.zero_optimization_stage <= - ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( + assert (self.zero_optimization_stage + <= ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( ZeroStageEnum.max_stage) if self.fp16_master_weights_and_gradients: diff --git a/deepspeed/runtime/domino/__init__.py b/deepspeed/runtime/domino/__init__.py new file mode 100644 index 0000000000000..208299fb8c50f --- /dev/null +++ b/deepspeed/runtime/domino/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team diff --git a/deepspeed/runtime/domino/transformer.py b/deepspeed/runtime/domino/transformer.py new file mode 100644 index 0000000000000..8eb95e49c29d0 --- /dev/null +++ b/deepspeed/runtime/domino/transformer.py @@ -0,0 +1,518 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter +import deepspeed +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator + + +def is_rank_0(): + if dist.get_rank() == 0: + return True + + +class DominoModule(torch.nn.Module): + """extensions of torch Module.""" + + def __init__(self, ): + super(DominoModule, self).__init__() + + +import enum + + +class LayerType(enum.Enum): + encoder = 1 + decoder = 2 + + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + + +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + + +handle_dic = {} + + +def no_oper(input_, dic_, h_id): + return NoOper.apply(input_, dic_, h_id) + + +class NoOper(torch.autograd.Function): + + @staticmethod + def symbolic(graph, input_, handle_dic, h_id): + return input_ + + @staticmethod + def forward(ctx, input_, handle_dic, h_id): + ctx.handle_dic = handle_dic + ctx.h_id = h_id + return input_ + + @staticmethod + def backward(ctx, grad_output): + handle = ctx.handle_dic[ctx.h_id] + handle.wait() + return grad_output, None, None + + +def copy_to_tensor_model_parallel_region_a(mpu, input_, dic_, h_id): + return _CopyToModelParallelRegionA.apply(mpu, input_, dic_, h_id) + + +class _CopyToModelParallelRegionA(torch.autograd.Function): + """Pass the input to the model parallel region.""" + + @staticmethod + def symbolic(graph, mpu, input_, handle_dic, h_id): + return input_ + + @staticmethod + def forward(ctx, mpu, input_, handle_dic, h_id): + ctx.mpu = mpu + ctx.handle_dic = handle_dic + ctx.h_id = h_id + return input_ + + @staticmethod + def backward(ctx, grad_output): + # Bypass the function if we are using only 1 GPU. + if ctx.mpu.get_tensor_model_parallel_world_size() == 1: + return grad_output + + # Async All-reduce. + handle = deepspeed.comm.all_reduce(grad_output, group=ctx.mpu.get_tensor_model_parallel_group(), async_op=True) + ctx.handle_dic[ctx.h_id] = handle + return None, grad_output, None, None + + +class CoreAttention(DominoModule): + + def __init__(self, config, layer_number, mpu, attn_mask_type=AttnMaskType.causal): + super(CoreAttention, self).__init__() + + self.layer_number = max(1, layer_number) + self.att_dropout_p = config.attention_dropout + self.is_causal = True + projection_size = config.kv_channels * config.num_attention_heads + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = projection_size // world_size + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + # attn_mask is None when is_causal=True + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, + key_layer, + value_layer, + attn_mask=None, + dropout_p=self.att_dropout_p, + is_causal=True, + scale=None) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class ShardedAttention(DominoModule): + """Sharded self-attention layer class. + Only support self attention and causal attention mask + """ + + def __init__(self, + config, + layer_number, + mpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.causal): + super(ShardedAttention, self).__init__() + self.layer_number = max(1, layer_number) + self.attention_type = attention_type + self.attn_mask_type = attn_mask_type + self.params_dtype = config.params_dtype + self.apply_rotary_pos_emb = apply_rotary_pos_emb + + query_projection_size = config.kv_channels * config.num_attention_heads + kv_projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = query_projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads // world_size + + self.query_key_value = ColumnParallelLinear(config.hidden_size, + query_projection_size + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) + + self.core_attention = CoreAttention(config, self.layer_number, mpu, self.attn_mask_type) + + self.dense = RowParallelLinearNoComm(query_projection_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, + bias=config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb=None): + # hidden_states: [s, b, h] + + # Query, Key, and Value + # Attention heads [s, b, h] --> [s, b, np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # [s, b, np * 3 * hn] --> [s, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [s, b, np, 3 * hn] -> [b, np, s, 3*hn] + mixed_x_layer = mixed_x_layer.permute(1, 2, 0, 3).contiguous() + + # [s, b, np, 3 * hn] --> [s, b, np, hn], [s, b, np, hn], [s, b, np, hn] + (query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, [ + self.hidden_size_per_attention_head, self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head + ], + dim=3) + # [s, b, np, np * hn] -> [s, b, np, hn] + query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, + self.hidden_size_per_attention_head) + + # apply rotary embedding + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = ((rotary_pos_emb, ) * 2) + q_pos_emb, k_pos_emb = rotary_pos_emb + query_layer = self.apply_rotary_pos_emb(query_layer, q_pos_emb) + key_layer = self.apply_rotary_pos_emb(key_layer, k_pos_emb) + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # Output. [s, b, h] + output, bias = self.dense(context_layer) + + return output, bias + + +class DominoTransformerLayer(DominoModule): + """A domino single transformer layer. + [s, b, h] -> [s, b, h] + """ + + def __init__(self, + config, + layer_number, + mpu, + fused_layer_norm, + _initialize_affine_weight_gpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + bias_dropout_add_fused_train, + bias_dropout_add_fused_inference, + skip_bias_add=True, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.causal, + drop_path_rate=0., + output_bias=None): + super(DominoTransformerLayer, self).__init__() + + self.llama_model = config.llama_model + self.layer_number = layer_number + self.layer_type = layer_type + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.bias_dropout_add_fused_train = bias_dropout_add_fused_train + self.bias_dropout_add_fused_inference = bias_dropout_add_fused_inference + self.mpu = mpu + self.output_bias = output_bias + + # Layernorm on the input data. + self.input_layernorm = fused_layer_norm(config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm) + + # Self attention. + self.self_attention = ShardedAttention(config, + layer_number, + mpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + attention_type=AttnType.self_attn, + attn_mask_type=self_attn_mask_type) + + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = fused_layer_norm(config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm) + + # ------------ init mlp start ------------ + init_method = config.init_method + output_layer_init_method = config.output_layer_init_method + self.add_bias = config.add_bias_linear + self.skip_bias_add = skip_bias_add + + ffn_hidden_size = config.ffn_hidden_size + if config.gated_linear_unit: + ffn_hidden_size *= 2 + self.output_size_c = config.ffn_hidden_size + self.input_size_c = config.hidden_size + self.input_size_r = config.ffn_hidden_size + self.output_size_r = self.input_size_c + + world_size = mpu.get_tensor_model_parallel_world_size() + self.output_size_per_partition = self.output_size_c // world_size + self.input_size_per_partition = self.input_size_r // world_size + + # Initialize weight. + self.weight_c = Parameter( + torch.empty(self.output_size_per_partition, + self.input_size_c, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + self.weight_r = Parameter( + torch.empty(self.output_size_r, + self.input_size_per_partition, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + + if config.perform_initialization: + _initialize_affine_weight_gpu(self.weight_c, init_method, partition_dim=0, stride=1) + + _initialize_affine_weight_gpu(self.weight_r, output_layer_init_method, partition_dim=1, stride=1) + + if self.add_bias: + self.bias_c = Parameter( + torch.empty(self.output_size_per_partition, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + self.bias_r = Parameter( + torch.empty(self.output_size_r, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + if config.perform_initialization: + with torch.no_grad(): + self.bias_c.zero_() + self.bias_r.zero_() + else: + self.register_parameter('bias_c', None) + self.register_parameter('bias_r', None) + + if config.swiglu: + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.mlp_activation_func = swiglu + else: + self.mlp_activation_func = F.gelu + # ------------ init mlp end ------------ + + def forward(self, hidden_states, attention_mask, rotary_pos_emb=None): + # hidden_states: [s, b, h] + hidden_states0, hidden_states1 = hidden_states + + layernorm_output0 = self.input_layernorm(hidden_states0) + layernorm_output1 = self.input_layernorm(hidden_states1) + + if not self.llama_model: + rotary_pos_emb = None + + attention_output0, attention_bias0 = \ + self.self_attention( + layernorm_output0, + attention_mask, + rotary_pos_emb=rotary_pos_emb) + handle0 = deepspeed.comm.all_reduce(attention_output0, + group=self.mpu.get_tensor_model_parallel_group(), + async_op=True) + + attention_output1, attention_bias1 = \ + self.self_attention( + layernorm_output1, + attention_mask, + rotary_pos_emb=rotary_pos_emb) + handle1 = deepspeed.comm.all_reduce(attention_output1, + group=self.mpu.get_tensor_model_parallel_group(), + async_op=True) + handle0.wait() + + # Residual0 connection. + if self.apply_residual_connection_post_layernorm: + residual0 = layernorm_output0 + else: + residual0 = hidden_states0 + + if self.training: + bias_dropout_add_func = self.bias_dropout_add_fused_train + else: + bias_dropout_add_func = self.bias_dropout_add_fused_inference + if attention_bias0 is not None: + attention_bias0 = attention_bias0.expand_as(residual0) + layernorm_input0 = bias_dropout_add_func(attention_output0, attention_bias0, residual0, self.hidden_dropout) + + layernorm_output0 = self.post_attention_layernorm(layernorm_input0) + layernorm_output0 = no_oper(layernorm_output0, handle_dic, f'{self.layer_number}_0') + + # Residual1 connection. + if self.apply_residual_connection_post_layernorm: + residual1 = layernorm_output1 + else: + residual1 = hidden_states1 + + if attention_bias1 is not None: + attention_bias1 = attention_bias1.expand_as(residual1) + layernorm_input1 = bias_dropout_add_func(attention_output1, attention_bias1, residual1, self.hidden_dropout) + + layernorm_output1 = self.post_attention_layernorm(layernorm_input1) + layernorm_output1 = no_oper(layernorm_output1, handle_dic, f'{self.layer_number}_1') + + # ------------ explicit mlp start ------------ + bias_c = self.bias_c if not self.skip_bias_add else None + + input0 = copy_to_tensor_model_parallel_region_a(self.mpu, layernorm_output0, handle_dic, + f'{self.layer_number}_0') + # Batch0 Matrix multiply. + output0 = torch.matmul(input0, self.weight_c.t()) + if bias_c is not None: + output0 = output0 + bias_c + output0 = self.mlp_activation_func(output0) + output0 = torch.matmul(output0, self.weight_r.t()) + handle2 = deepspeed.comm.all_reduce(output0, group=self.mpu.get_tensor_model_parallel_group(), async_op=True) + + handle1.wait() + + input1 = copy_to_tensor_model_parallel_region_a(self.mpu, layernorm_output1, handle_dic, + f'{self.layer_number}_1') + # Batch1 Matrix multiply. + output1 = torch.matmul(input1, self.weight_c.t()) + output1 = self.mlp_activation_func(output1) + if bias_c is not None: + output1 = output1 + bias_c + output1 = torch.matmul(output1, self.weight_r.t()) + deepspeed.comm.all_reduce(output1, group=self.mpu.get_tensor_model_parallel_group()) + + handle2.wait() + + output0 = output0 + self.bias_r if self.bias_r is not None else output0 + output1 = output1 + self.bias_r if self.bias_r is not None else output1 + output_bias = self.output_bias + mlp_output0, mlp_output1, mlp_bias0, mlp_bias1 = output0, output1, output_bias, output_bias + # ------------ explicit mlp end ------------ + + if self.apply_residual_connection_post_layernorm: + residual0 = layernorm_output0 + residual1 = layernorm_output1 + else: + residual0 = layernorm_input0 + residual1 = layernorm_input1 + + if mlp_bias0 is not None: + mlp_bias0 = mlp_bias0.expand_as(residual0) + mlp_bias1 = mlp_bias1.expand_as(residual1) + output0 = bias_dropout_add_func(mlp_output0, mlp_bias0, residual0, self.hidden_dropout) + output1 = bias_dropout_add_func(mlp_output1, mlp_bias1, residual1, self.hidden_dropout) + + return output0, output1 + + +class DominoTransformer(DominoModule): + """Transformer class.""" + + def __init__(self, + config, + model_type, + mpu, + fused_layer_norm, + _initialize_affine_weight_gpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + bias_dropout_add_fused_train, + bias_dropout_add_fused_inference, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.causal, + pre_process=True, + post_process=True, + post_layer_norm=True, + drop_path_rate=0.0): + super(DominoTransformer, self).__init__() + + self.layer_type = layer_type + self.model_type = model_type + self.post_process = post_process + self.post_layer_norm = post_layer_norm + self.num_layers = config.num_layers + self.drop_path_rate = drop_path_rate + self.drop_path_rates = [rate.item() for rate in torch.linspace(0, self.drop_path_rate, config.num_layers)] + + def build_layer(layer_number): + return DominoTransformerLayer(config, + layer_number, + mpu, + fused_layer_norm, + _initialize_affine_weight_gpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + bias_dropout_add_fused_train, + bias_dropout_add_fused_inference, + layer_type=layer_type, + self_attn_mask_type=self_attn_mask_type, + drop_path_rate=self.drop_path_rates[layer_number - 1]) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + self.final_layernorm = fused_layer_norm(config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb=None): + # hidden_states: [s, b, h] + + for index in range(self.num_layers): + layer = self.layers[index] + hidden_states = layer(hidden_states, attention_mask, rotary_pos_emb) + + hidden_states0, hidden_states1 = hidden_states + if self.post_process and self.post_layer_norm: + hidden_states0 = self.final_layernorm(hidden_states0) + hidden_states1 = self.final_layernorm(hidden_states1) + + return hidden_states0, hidden_states1 diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py index 36300eb904ddc..a82d8b1d5c7a0 100755 --- a/deepspeed/runtime/eigenvalue.py +++ b/deepspeed/runtime/eigenvalue.py @@ -114,8 +114,8 @@ def compute_eigenvalue(self, module, device=None, scale=1.0): eigenvalue_current, eigenvalue_previous = 1., 0. while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs( - (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >= - self.tol): # test convergence criteria + (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) + >= self.tol): # test convergence criteria eigenvalue_previous = eigenvalue_current Hv = torch.autograd.grad(grads, params, grad_outputs=v, only_inputs=True, retain_graph=True) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 05bb23e8ddd94..8c5da36e5a787 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -17,6 +17,7 @@ from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from contextlib import contextmanager from typing import Callable, Dict, Union, Iterable, Container @@ -216,6 +217,7 @@ def __init__(self, self.loaded_checkpoint_mp_world_size = None self.loaded_checkpoint_dp_world_size = None self.enable_backward_allreduce = True + self.inside_no_sync_ctxt = False self.progressive_layer_drop = None self.eigenvalue = None self.block_eigenvalue = None @@ -811,6 +813,9 @@ def zero_max_reuse_distance(self): def zero_prefetch_bucket_size(self): return self._config.zero_config.prefetch_bucket_size + def zero_module_granularity_threshold(self): + return self._config.zero_config.module_granularity_threshold + def zero_param_persistence_threshold(self): return self._config.zero_config.param_persistence_threshold @@ -1077,7 +1082,10 @@ def _supported_optims(self): # Validate configuration based on command line arguments def _do_sanity_check(self): if self.fp16_enabled() and not get_accelerator().is_fp16_supported(): - raise ValueError("Type fp16 is not supported.") + raise ValueError("Type fp16 is not supported on your device.") + + if self.bfloat16_enabled() and not get_accelerator().is_bf16_supported(): + raise ValueError("Type bf16 is not supported on your device.") expected_optim_types = self._supported_optims() expected_optim_types += [type(None), Callable] @@ -1611,6 +1619,7 @@ def _configure_zero_optimizer(self, optimizer): zero_param_parallel_group=zero_param_parallel_group, zero_quantized_weights=self.zero_quantized_weights(), zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(), + zero_module_granularity_threshold=self.zero_module_granularity_threshold(), ) else: log_dist( @@ -1657,6 +1666,7 @@ def _configure_zero_optimizer(self, optimizer): zero_hpz_partition_size=self.zero_hpz_partition_size(), zero_quantized_weights=self.zero_quantized_weights(), zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(), + zero_module_granularity_threshold=self.zero_module_granularity_threshold(), ) else: @@ -1973,12 +1983,31 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): grads = None self.buffered_allreduce_fallback(grads=grads, elements_per_buffer=bucket_size) + @contextmanager + def no_sync(self): + r""" + Context manager to disable gradient reduction during backward pass. + This context manager has the following effects on other DeepSpeed features. + 1. Incompatible with ZeRO stage 2/3 which rely on reduction for gradient partitioning. + 2. It is illegal to call engine.step() within the context manager. + 3. Tracking of gradient accumulation steps is disabled. + """ + assert not self.zero_optimization_partition_gradients(), \ + f"no_sync context manager is incompatible with gradient partitioning logic of ZeRO stage {self.zero_optimization_stage()}" + + assert not self.inside_no_sync_ctxt, f"no_sync context manager reentry is unsupported" + + self.inside_no_sync_ctxt = True + try: + yield + finally: + self.inside_no_sync_ctxt = False + @instrument_w_nvtx - def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_graph=False, scale_wrt_gas=True): + def backward(self, loss, release_loss=False, retain_graph=False, scale_wrt_gas=True): r"""Execute backward pass on the loss Arguments: loss: Torch tensor on which to execute backward propagation - allreduce_gradients: is deprecated, ignored, and will soon be removed' retain_graph: bool, default: false forward on user defined choice of retain_graph """ @@ -1988,11 +2017,10 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_gr if self.scale_wrt_gas is not None: scale_wrt_gas = self.scale_wrt_gas - if not allreduce_gradients: - logger.warning(f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed") + do_gradient_reduction = self.enable_backward_allreduce and not self.inside_no_sync_ctxt - # scale loss w.r.t. gradient accumulation if needed - if self.gradient_accumulation_steps() > 1 and scale_wrt_gas: + # scale loss w.r.t. gradient accumulation if reduction is not disabled + if do_gradient_reduction and self.gradient_accumulation_steps() > 1 and scale_wrt_gas: loss = self._scale_loss_by_gas(loss.float()) # Log training loss @@ -2041,7 +2069,7 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_gr self._start_timers(self.engine_timers.backward_reduce_timers) - if allreduce_gradients and self.enable_backward_allreduce: + if do_gradient_reduction: # Traditional code path that allreduces the module parameter grads self.allreduce_gradients() @@ -2177,6 +2205,9 @@ def step(self, lr_kwargs=None): r"""Execute the weight update step after forward and backward propagation on effective_train_batch. """ + assert not self.inside_no_sync_ctxt, \ + "It is illegal to call Engine.step() inside no_sync context manager" + see_memory_usage("Engine before step", force=self.memory_breakdown()) # Check early because self.global_steps is incremented at some point here. diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 7ebf5487cf9ed..deb44c2e71eb9 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -287,7 +287,8 @@ def _exec_reduce_tied_grads(self): weight_group_list = self.module.get_tied_weights_and_groups() for weight, group in weight_group_list: grad = weight._hp_grad if self.using_bf16_optimizer else weight.grad - dist.all_reduce(grad, group=group) + if grad is not None: + dist.all_reduce(grad, group=group) def _exec_reduce_grads(self): self._force_grad_boundary = True @@ -639,9 +640,10 @@ def _aggregate_total_loss(self): self.dp_group_loss = losses[0].clone().detach() agg_loss = losses[1].clone().detach() if additional_losses is not None: - self.agg_additional_losses = OrderedDict( - {name: losses[2 + i].clone().detach() - for i, name in enumerate(additional_losses.keys())}) + self.agg_additional_losses = OrderedDict({ + name: losses[2 + i].clone().detach() + for i, name in enumerate(additional_losses.keys()) + }) return agg_loss def set_dataloader(self, loader): diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 90b2d9b8bd31e..1f9825c34638b 100644 --- a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -18,12 +18,12 @@ def swap_in_tensors(swap_handle, tensor_buffers, swap_paths): for buffer, path in zip(tensor_buffers, swap_paths): - assert (swap_handle.async_pread(buffer, path) == 0) + assert (swap_handle.async_pread(buffer, path, 0) == 0) def swap_out_tensors(swap_handle, tensor_buffers, swap_paths): for buffer, path in zip(tensor_buffers, swap_paths): - assert (swap_handle.async_pwrite(buffer, path) == 0) + assert (swap_handle.async_pwrite(buffer, path, 0) == 0) def print_object(obj, name, exclude_list=[]): diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index b9617d3e632fd..f48adb58c9bff 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -257,8 +257,8 @@ def has_overflow(self, params, has_moe_params=None): elif self.mpu is not None: if self.deepspeed is not None: using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') - if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce is False) or ( - not using_pipeline and self.deepspeed.enable_backward_allreduce is False): + if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce + is False) or (not using_pipeline and self.deepspeed.enable_backward_allreduce is False): dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_data_parallel_group()) dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False: diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 1cfcd784e2ceb..19b272ce9e92b 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -21,6 +21,7 @@ "stage3_max_live_parameters" : 1000000000, "stage3_max_reuse_distance" : 1000000000, "stage3_use_all_reduce_for_fetch_params": [true|false], + "stage3_module_granularity_threshold": 0, "allgather_partitions": [true|false], "use_multi_rank_bucket_allreduce": [true|false], "allgather_bucket_size": 500000000, @@ -245,6 +246,14 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): this option is enabled and then saves the fp16 model weights. """ + module_granularity_threshold: int = Field(pp_int(0), alias="stage3_module_granularity_threshold") + """ + The granularity of a module is determined by the ratio of "parameter_count / (1 + descendant count)". + ZeRO3 classifies modules with a granularity below the threshold as fine-grained, + which are treated as integral units during parameter fetching. This reduces host overhead + and the separate allgather overhead introduced by hooks for fine-grained layers when fetching parameters. + """ + use_all_reduce_for_fetch_params: bool = Field(False, alias="stage3_use_all_reduce_for_fetch_params") """ Use all_reduce op when fetching module parameters at stage3. This improves performance by reducing @@ -302,7 +311,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): for efficient all_2_all_reduce comm """ - mics_shard_size: int = Field(-1, new_param="mics_shard_size") + mics_shard_size: int = Field(-1, json_schema_extra={"new_param": "mics_shard_size"}) mics_hierarchical_params_gather: bool = False diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index 1ce2414a1e17c..f945f51661900 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -6,7 +6,7 @@ import sys import torch from collections import OrderedDict -from deepspeed.utils import z3_leaf_module +from deepspeed.utils import z3_leaf_module, set_z3_leaf_module from deepspeed.runtime.utils import see_memory_usage from deepspeed.runtime.zero.utils import apply_to_tensors_only, is_zero_param from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum @@ -14,6 +14,7 @@ from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.partitioned_param_coordinator import PartitionedParameterCoordinator, InflightParamRegistry, iter_params from deepspeed.accelerator import get_accelerator +from deepspeed import utils FWD_MODULE_STACK = list() @@ -38,7 +39,7 @@ def _apply_forward_and_backward_to_tensors_only(module, forward_function, backwa class ZeROOrderedDict(OrderedDict): - def __init__(self, parent_module=None, *args, **kwargs): + def __init__(self, parent_module, *args, **kwargs): """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. Args: @@ -49,6 +50,10 @@ def __init__(self, parent_module=None, *args, **kwargs): self._parent_module = parent_module self._in_forward = False + def __reduce__(self): + r0, _, *r2 = super().__reduce__() + return (r0, (self._parent_module, )) + r2 + def __getitem__(self, key): param = super().__getitem__(key) @@ -56,6 +61,7 @@ def __getitem__(self, key): if param is None: return param + # TODO: only weaken this check during compilation if hasattr(param, "ds_status") and param.ds_status == ZeroParamStatus.NOT_AVAILABLE: if self._parent_module._parameters._in_forward: register_external_parameter(FWD_MODULE_STACK[-1], param) @@ -96,6 +102,7 @@ def __init__( zero_param_parallel_group=None, zero_quantized_weights=False, zero_quantized_nontrainable_weights=False, + zero_module_granularity_threshold=0, ): see_memory_usage("DeepSpeedZeRoOffload initialize [begin]", force=True) @@ -128,7 +135,6 @@ def __init__( self.persistent_parameters = self.mark_persistent_parameters(self.param_numel_persistence_threshold, self.model_persistence_threshold) - self.param_coordinators = {} self._prefetch_bucket_sz = int(prefetch_bucket_size) self._max_reuse_distance_in_numel = int(max_reuse_distance) self._max_available_parameters_in_numel = int(max_live_parameters) @@ -136,14 +142,31 @@ def __init__( ) if overlap_comm else get_accelerator().default_stream() if not hasattr(module, "ds_inflight_param_registry"): - module.ds_inflight_param_registry = dict() - # we need two registries, one for training and one for eval. They will be used when creating PartitionedParameterCoordinator - module.ds_inflight_param_registry[True] = InflightParamRegistry() - module.ds_inflight_param_registry[False] = InflightParamRegistry() + module.ds_inflight_param_registry = InflightParamRegistry() self.__inflight_param_registry = module.ds_inflight_param_registry + self.param_coordinator = PartitionedParameterCoordinator( + prefetch_bucket_sz=self._prefetch_bucket_sz, + max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, + max_available_parameters_in_numel=self._max_available_parameters_in_numel, + allgather_stream=self.__allgather_stream, + inflight_param_registry=self.__inflight_param_registry, + prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme, + timers=self.timers, + zero_quantized_weights=self.zero_quantized_weights, + zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights, + ) + + if zero_module_granularity_threshold > 0: + self.min_granularity_value = sys.maxsize + self.min_granularity_layer = None + self.granularity_info = set() + self.z3_leaf_layers = [] + self._set_z3_leaf_modules_by_threshold(module, zero_module_granularity_threshold) + self.forward_hooks = [] self.backward_hooks = [] + self.setup_zero_stage3_hooks() print_rank_0( f'Created module hooks: forward = {len(self.forward_hooks)}, backward = {len(self.backward_hooks)}', @@ -156,26 +179,13 @@ def partition_all_parameters(self): """Partitioning Parameters that were not partitioned usually if parameters of modules whose input parameters do not require grad computation do not trigger post call and will therefore will remain unpartitioned""" - self.get_param_coordinator(training=self.module.training).release_and_reset_all(self.module) + self.get_param_coordinator().release_and_reset_all(self.module) for param in iter_params(self.module, recurse=True): if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: raise RuntimeError(f"{param.ds_summary()} expected to be released") - def get_param_coordinator(self, training): - if not training in self.param_coordinators: - self.param_coordinators[training] = PartitionedParameterCoordinator( - prefetch_bucket_sz=self._prefetch_bucket_sz, - max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, - max_available_parameters_in_numel=self._max_available_parameters_in_numel, - allgather_stream=self.__allgather_stream, - inflight_param_registry=self.__inflight_param_registry[training], - prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme, - timers=self.timers, - zero_quantized_weights=self.zero_quantized_weights, - zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights, - ) - - return self.param_coordinators[training] + def get_param_coordinator(self): + return self.param_coordinator def empty_partition_cache(self): self.partition_all_parameters() @@ -223,14 +233,14 @@ def setup_zero_stage3_hooks(self): #reset step if in inference mode @instrument_w_nvtx - def _end_of_forward_hook(module, *args): + def _start_of_forward_hook(module, *args): + + self.get_param_coordinator().reset_step() - if not torch._C.is_grad_enabled(): - self.get_param_coordinator(training=False).reset_step() + self.module.register_forward_pre_hook(_start_of_forward_hook) #likely one of them should be enough but just to be safe self._register_hooks_recursively(self.module) - self.module.register_forward_hook(_end_of_forward_hook) # Add top module to stack trace global FWD_MODULE_STACK @@ -382,7 +392,8 @@ def _run_before_forward_function(input): _run_after_backward_hook, inputs) def _post_backward_module_hook(module, inputs): - module.ds_grads_remaining = 0 + if not hasattr(module, "ds_grads_remaining"): + module.ds_grads_remaining = 0 if not hasattr(module, "post_bwd_fn"): @@ -442,7 +453,7 @@ def pre_sub_module_forward_function(self, sub_module): global FWD_MODULE_STACK FWD_MODULE_STACK.append(sub_module) - param_coordinator = self.get_param_coordinator(training=sub_module.training) + param_coordinator = self.get_param_coordinator() param_coordinator.trace_prologue(sub_module) if param_coordinator.is_record_trace(): param_coordinator.record_module(sub_module) @@ -455,7 +466,7 @@ def post_sub_module_forward_function(self, sub_module): see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) - param_coordinator = self.get_param_coordinator(training=sub_module.training) + param_coordinator = self.get_param_coordinator() param_coordinator.release_sub_module(sub_module) see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", @@ -463,8 +474,8 @@ def post_sub_module_forward_function(self, sub_module): @torch.no_grad() def pre_sub_module_backward_function(self, sub_module): - assert sub_module.training, "backward pass is invalid for module in evaluation mode" - param_coordinator = self.get_param_coordinator(training=True) + # assert sub_module.training, "backward pass is invalid for module in evaluation mode" + param_coordinator = self.get_param_coordinator() param_coordinator.trace_prologue(sub_module) if param_coordinator.is_record_trace(): param_coordinator.record_module(sub_module) @@ -472,13 +483,92 @@ def pre_sub_module_backward_function(self, sub_module): @torch.no_grad() def post_sub_module_backward_function(self, sub_module): - assert sub_module.training, "backward pass is invalid for module in evaluation mode" + # assert sub_module.training, "backward pass is invalid for module in evaluation mode" see_memory_usage( f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) - self.get_param_coordinator(training=True).release_sub_module(sub_module) + self.get_param_coordinator().release_sub_module(sub_module) see_memory_usage( f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", force=False) + + def _set_z3_leaf_modules_by_threshold(self, module, zero_module_granularity_threshold): + + self._get_granularity_recursively(module) + print_rank_0(f"{'MODULE NAME'.ljust(30)}|{'GRANULARITY VALUE'.rjust(20)}", force=True) + for granularity in self.granularity_info: + print_rank_0(granularity, force=True) + + if self.min_granularity_value <= zero_module_granularity_threshold: + self._set_leaf_by_threshold_preorder(module, zero_module_granularity_threshold) + utils.logger.info( + f"z3_leaf_module was set by stage3_module_granularity_threshold:{zero_module_granularity_threshold}") + for layer in self.z3_leaf_layers: + print_rank_0(f"{layer.__class__.__name__}:{layer.ds_model_granularity}", force=True) + else: + utils.logger.warning( + f"The smallest module granularity is [{self.min_granularity_layer}:{self.min_granularity_value}]. "\ + f"To make stage3_module_granularity_threshold effective, you need to set stage3_module_granularity_threshold >= {self.min_granularity_value}. "\ + f"Current Value:{zero_module_granularity_threshold}" + ) + + def _get_granularity_recursively(self, module): + """This function is used to recursively obtain the granularity of each module.""" + + # avoid setting as leaf for particularly large models, even if the granularity is very small + # an oversized leaf module increases the number of live parameters, introducing memory overhead + Z3_MAX_LEAF_SIZE = 1e9 + + if not list(module.parameters()): + # skip Modules without parameters, such as GELU, etc. + module.ds_model_granularity = sys.maxsize + return 0, 0 + + num_layers = 0 + num_params = 0 + num_params += sum(p.ds_numel for p in module.parameters(recurse=False)) + if not any(module.children()): + # torch leaf module + module.ds_model_granularity = sys.maxsize + return 1, num_params + + for child in module.children(): + layers_in_child, params_in_child = self._get_granularity_recursively(child) + num_layers += layers_in_child + num_params += params_in_child + + if module.__class__.__name__ in torch.nn.modules.container.__all__: + # Do not set container modules like ModuleList as leaf modules + # as this will prevent hooks from being set on their children + # and they may do not invoke the forward method + module.ds_model_granularity = sys.maxsize + return num_layers, num_params + + num_layers += 1 + ds_model_granularity = (num_params // num_layers) if num_params <= Z3_MAX_LEAF_SIZE else sys.maxsize + module.ds_model_granularity = ds_model_granularity + # module.ds_model_num_layers = num_layers + # module.ds_model_num_params = num_params + if self.min_granularity_value > ds_model_granularity: + self.min_granularity_value = ds_model_granularity + self.min_granularity_layer = module.__class__.__name__ + self.granularity_info.add(f"{module.__class__.__name__.ljust(30)}|{str(ds_model_granularity).rjust(20)}") + + return num_layers, num_params + + def _set_leaf_by_threshold_preorder(self, module, granularity_treshhold): + '''Set modules as leaf modules based on the threshold, prioritizing parent nodes.''' + + num_params = sum(p.ds_numel for p in module.parameters()) + if num_params == 0: + # skip Modules without parameters, such as GELU, etc. + return + if module.ds_model_granularity <= granularity_treshhold: + set_z3_leaf_module(module, True) + self.z3_leaf_layers.append(module) + return + + for sub_module in module.children(): + self._set_leaf_by_threshold_preorder(sub_module, granularity_treshhold) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 22a6746bb57cf..cb0cd7c8017db 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1882,6 +1882,7 @@ def _allgather_params_coalesced(self, param_list, hierarchy=0, quantize=False): return None + @torch.no_grad() def _allgather_params(self, param_list, hierarchy=0): if len(param_list) == 0: return diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index 5780b2afd6def..596d0e9c20f93 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -18,6 +18,7 @@ from deepspeed.utils.debug import debug_module2name_id, debug_param2name_id from deepspeed.accelerator import get_accelerator import deepspeed.runtime.compiler as compiler +from deepspeed.runtime.compiler import is_compiling import logging @@ -92,7 +93,7 @@ def __init__( # keeps track of the number of submodules invoked so far. self.__step_id: int = 0 # network tracing mode - self.__trace_mode: ZeRoTraceMode = ZeRoTraceMode.RECORD + self.__trace_mode: ZeRoTraceMode = ZeRoTraceMode.INVALID # sequence of submodules/parameters in forward pass + backward pass self.__submodule_order: Iterable[Module] = [] self.__param_order: Iterable[__class__.__ParamInTrace] = [] @@ -188,6 +189,9 @@ def trace_prologue(self, sub_module: Module) -> None: @compiler.disable def record_module(self, sub_module: Module) -> None: """adds sub module to trace""" + if is_compiling(): + return + if not self.is_record_trace(): raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") @@ -195,6 +199,8 @@ def record_module(self, sub_module: Module) -> None: self.__step_id_module_fetched_for[sub_module.id].append(self.__step_id) def record_parameters(self, sub_module: Module) -> None: + if is_compiling(): + return """adds sub module to trace""" if not self.is_record_trace(): raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") @@ -209,8 +215,12 @@ def construct_parameter_trace_from_module_trace(self): for sub_module in self.__submodule_order: self.record_parameters(sub_module) + @compiler.disable def reset_step(self) -> None: """indicate that we have completed one fwd+bwd for the model""" + if is_compiling(): + return + self._clean_inflight_param_registry() if not self.is_complete_trace(): # not self.trace_complete: @@ -242,7 +252,6 @@ def reset_step(self) -> None: self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque()) self.__step_id = 0 - self.__n_available_params = 0 self.__profiler.reset_events() def _dump_params(self, tag, sub_module, params, step_id=None): @@ -420,7 +429,7 @@ def release_and_reset_all(self, module: Module) -> None: # there's a hook execution issue param.ds_active_sub_modules.clear() self.__release_param(param) - + self.__n_available_params = 0 for param in iter_params(module, recurse=True): if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: raise RuntimeError(f"{param.ds_summary()} expected to be released") diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index e2c273fd913fb..99a5ecf41a2fc 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -157,6 +157,7 @@ def __init__( zero_hpz_partition_size=1, zero_quantized_weights=False, zero_quantized_nontrainable_weights=False, + zero_module_granularity_threshold=0, ): see_memory_usage("Stage 3 initialize beginning", force=True) @@ -227,7 +228,8 @@ def __init__( mpu=mpu, zero_param_parallel_group=zero_param_parallel_group, zero_quantized_weights=zero_quantized_weights, - zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights) + zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights, + zero_module_granularity_threshold=zero_module_granularity_threshold) self.persistent_parameters = self.parameter_offload.persistent_parameters self._configure_offloading(offload_optimizer_config, offload_param_config) @@ -458,6 +460,7 @@ def initialize_ds_offload( zero_param_parallel_group, zero_quantized_weights, zero_quantized_nontrainable_weights, + zero_module_granularity_threshold, ): return DeepSpeedZeRoOffload(module=module, timers=timers, @@ -473,7 +476,8 @@ def initialize_ds_offload( mpu=mpu, zero_param_parallel_group=zero_param_parallel_group, zero_quantized_weights=zero_quantized_weights, - zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights) + zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights, + zero_module_granularity_threshold=zero_module_granularity_threshold) def _get_trainable_parameter_groups(self): param_groups = [] @@ -538,10 +542,15 @@ def _setup_for_real_optimizer(self): self.grad_partitions_flat_buffer = get_accelerator().pin_memory(self.grad_partitions_flat_buffer) offset = 0 + max_partition_numel = 0 for param in all_params: self.__param_id_to_grad_partition[param.ds_id] = self.grad_partitions_flat_buffer.narrow( 0, offset, param.partition_numel()) offset += param.partition_numel() + max_partition_numel = max(max_partition_numel, param.partition_numel()) + if self.offload_optimizer: + self.pinned_grad_buffer: Tensor = get_accelerator().pin_memory( + torch.empty(max_partition_numel, device=self.device)) def _link_all_hp_params(self): for p in self.module.parameters(): @@ -588,8 +597,8 @@ def defragment(tensors: List[Tensor]) -> Tensor: return device_buffer - def _get_param_coordinator(self, training): - return self.parameter_offload.get_param_coordinator(training) + def _get_param_coordinator(self): + return self.parameter_offload.get_param_coordinator() def _configure_offloading(self, offload_optimizer_config, offload_param_config): ###################### offload optimizer setup ################################## @@ -1498,9 +1507,13 @@ def partition_grads(self, params_to_release: List[Parameter], grad_partitions: L offload_fp32_gradients[i].append(grad_buffer.float()) offload_fp32_offsets[i].append(dest_offset) else: + buffer_numel = grad_buffer.numel() fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow( - 0, dest_offset, grad_buffer.numel()) - fp32_grad_tensor.copy_(grad_buffer) + 0, dest_offset, buffer_numel) + self.pinned_grad_buffer[:buffer_numel].copy_( + grad_buffer.to(dtype=torch.float32, non_blocking=True)) + get_accelerator().synchronize() + fp32_grad_tensor.copy_(self.pinned_grad_buffer[:buffer_numel], non_blocking=True) # free the gradient if not get_accelerator().is_synchronized_device(): @@ -1865,7 +1878,7 @@ def _pre_step(self): see_memory_usage(f"In step before checking overflow", force=False) print_rank_0("Finished Tracing at Beginning of Step") - self._get_param_coordinator(training=True).hierarchy = 0 + self._get_param_coordinator().hierarchy = 0 print_rank_0("Finished Tracing at Beginning of Step") @@ -2249,8 +2262,6 @@ def backward(self, loss, retain_graph=False): else: self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - self._get_param_coordinator(training=True).reset_step() - if self.swap_optimizer: self.optimizer_swapper.post_backward() @@ -2730,7 +2741,7 @@ def load_hp_checkpoint_state_from_checkpoint_dir_stage3(self, checkpoint_dir, pa assert os.path.isfile( optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.' - optim_sd = torch.load(optim_state_path) + optim_sd = torch.load(optim_state_path, weights_only=False) self._load_global_state_stage3(optim_sd) key_list = ["fp32", "exp_avg", "exp_avg_sq"] @@ -2788,7 +2799,7 @@ def load_hp_checkpoint_state(self, folder, key): local_rank = dist.get_local_rank() # Load tensors from files and reshape them to flat vectors - loaded_checkpoint_state = torch.load(os.path.join(folder, f"{key}.pt")).view(-1) + loaded_checkpoint_state = torch.load(os.path.join(folder, f"{key}.pt"), weights_only=False).view(-1) # Partition the loaded data according to the local rank world_size = dist.get_world_size(group=self.dp_process_group) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index df7a2f83e3bcc..7ac89a2338087 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1070,14 +1070,10 @@ def average_tensor(self, tensor): for i, param, param_id in self.params_in_ipg_bucket: process_group = self.dp_process_group - grad_reduc = self.get_gradient_for_reduction(param) - #Averages gradients at parameter level if ipg has a moe param - #Otherwise averaging is done at the entire buffer level at the end of the loop - # MoE param have different groups + if self.ipg_bucket_has_moe_params: process_group = self.expert_dp_process_group[param.group_name] if is_moe_param( param) else self.dp_process_group - grad_reduc.data.div_(dist.get_world_size(group=process_group) / float(self.sequence_parallel_size)) partition_ids = self.param_to_partition_ids[i][param_id] assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids @@ -1116,8 +1112,7 @@ def average_tensor(self, tensor): curr_size += numel prev_id, prev_process_group = partition_id, process_group - if not self.ipg_bucket_has_moe_params: - tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size)) + tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size)) buckets = {} for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets): @@ -2302,11 +2297,6 @@ def load_state_dict(self, def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): self.load_hp_checkpoint_state_from_checkpoint_dir("bit16_groups", checkpoint_folder) - @property - def param_groups(self): - """Forward the wrapped optimizer's parameters.""" - return self.optimizer.param_groups - def _load_global_state(self, sd): self.loss_scaler = sd.get(LOSS_SCALER, self.loss_scaler) self.dynamic_loss_scale = sd.get('dynamic_loss_scale', self.dynamic_loss_scale) diff --git a/deepspeed/sequence/layer.py b/deepspeed/sequence/layer.py index e809fe1118b51..8fd3ca9184330 100644 --- a/deepspeed/sequence/layer.py +++ b/deepspeed/sequence/layer.py @@ -10,6 +10,8 @@ import deepspeed.comm as dist from deepspeed.accelerator import get_accelerator +from deepspeed.module_inject.tp_shard import get_shard_size_list, set_num_kv_heads, get_num_kv_heads +from deepspeed.utils import groups def post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, seq_len, num_head, head_dim): @@ -38,8 +40,132 @@ def post_func(input): return post_func +def uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group): + seq_world_size = dist.get_world_size(group) + inp_shape = list(input.shape) + assert batch_dim_idx in [0, 1], "batch_dim_idx must be either 0 or 1" + + if not (scatter_idx < 2): + input_splits = get_shard_size_list(inp_shape[scatter_idx], seq_world_size) + input = input.transpose(0, scatter_idx).contiguous() + local_heads = input_splits[groups._get_sequence_parallel_rank()] + output_splits = [local_heads] * seq_world_size + + output_buffer_shape = [seq_world_size * local_heads] + list(input.shape[1:]) + output = torch.empty(output_buffer_shape, device=input.device, dtype=input.dtype) + dist.all_to_all_single(output,input,output_split_sizes=output_splits,\ + input_split_sizes=input_splits,group=group) + ###[seq_ws*local_heads, ...] to [seq_ws, local_heads, ...] + output = output.view(seq_world_size, local_heads, *output.shape[1:]) + ###[seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...] + + ### batch_dim_idx=0 [seq_ws,local_heads,seq_len,b,...] to [b, seq_ws, seq_len, local_heads ...] + ### batch_dim_idx=1 [seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...] + if batch_dim_idx == 0: + order = [3, 0, 2, 1] + list(range(4, len(output.shape))) + output = output.permute(order).contiguous() + ###[b, seq_ws*local_seq_len, local_heads,...] + output = output.view(output.shape[0], inp_shape[gather_idx] * seq_world_size, + *output.shape[3:]).contiguous() + elif batch_dim_idx == 1: + output = output.transpose(1, 3).contiguous() + ###[seq_ws*local_seq_len, b, local_heads,...] + output = output.view(inp_shape[gather_idx] * seq_world_size, *output.shape[2:]).contiguous() + else: + # The compatibility handling of 4D and 3D tensors, standardizing to 3D. + input = input.reshape(input.shape[0], input.shape[1], -1) + + if batch_dim_idx == 0: #b,s,h + input = input.permute(1, 2, 0).contiguous() #s,h,b + elif batch_dim_idx == 1: #s,b,h + input = input.transpose(1, 2).contiguous() #s,h,b + seq_len, h, batch_size = input.shape + num_local_heads_list = get_shard_size_list(get_num_kv_heads(), seq_world_size) + local_heads = num_local_heads_list[groups._get_sequence_parallel_rank()] + h_dim = h // local_heads + local_seq_len = seq_len // seq_world_size + + input = input.view(seq_len * h, batch_size) + local_seq_len_with_heads = int(input.shape[0] / seq_world_size) # dim size of local_seq_len*local_heads*hdim + input_splits = [local_seq_len_with_heads] * seq_world_size + coeff = local_seq_len_with_heads // local_heads #per head: dim size of local_seq_len*hdim + + #uneven seq_world_size coeff, total_heads/local_heads. + heads_scale_coeff = get_num_kv_heads() / local_heads + + output_splits = [num_local_heads * coeff for num_local_heads in num_local_heads_list] + output_buff_d1_size = int(heads_scale_coeff * local_seq_len_with_heads) + total_h = int(inp_shape[gather_idx] * heads_scale_coeff) + output = torch.empty(output_buff_d1_size, input.shape[1], device=input.device, dtype=input.dtype) + dist.all_to_all_single(output,input,output_split_sizes=output_splits, \ + input_split_sizes=input_splits,group=group) + ################## + #suppose 7 heads divide into 4 ranks [2,2,2,1] + #chunk_num_heads_small=floor(7/4)=1 + #chunk_num_heads_large=ceil(7/4)=2 + #num_chunk_heads_large=len([2,2,2])=3, all2all_buffer_counts + #num_chunk_heads_small=len([1])=1, all2all_buffer_counts + #total_num_large_heads=sum([2,2,2])=7 + #total_num_small_heads=sum([1])=1 + + chunk_num_heads_small = get_num_kv_heads() // seq_world_size # even heads compatible + chunk_num_heads_large = chunk_num_heads_small + 1 + num_chunk_heads_large = get_num_kv_heads() % seq_world_size + num_chunk_heads_small = seq_world_size - num_chunk_heads_large + total_num_large_heads = num_chunk_heads_large * chunk_num_heads_large + total_num_small_heads = num_chunk_heads_small * chunk_num_heads_small + + heads_large_combine_size = coeff * total_num_large_heads + heads_small_combine_size = coeff * total_num_small_heads + heads_large_chunk, heads_small_chunk = output.split([heads_large_combine_size, heads_small_combine_size], + dim=0) + heads_large_chunk = heads_large_chunk.view(num_chunk_heads_large, local_seq_len, chunk_num_heads_large, h_dim, + batch_size) + heads_small_chunk = heads_small_chunk.view(num_chunk_heads_small, local_seq_len, chunk_num_heads_small, h_dim, + batch_size) + if batch_dim_idx == 0: + #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[batch,local_seq_len,all2all_buffer_counts*n_heads,dim] + order = [4, 1, 0, 2, 3] + heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(batch_size, local_seq_len, + total_num_large_heads, h_dim) + heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(batch_size, local_seq_len, + total_num_small_heads, h_dim) + elif batch_dim_idx == 1: + #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[local_seq_len,batch,all2all_buffer_counts*n_heads,dim] + order = [1, 4, 0, 2, 3] + heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(local_seq_len, batch_size, + total_num_large_heads, h_dim) + heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(local_seq_len, batch_size, + total_num_small_heads, h_dim) + + output = torch.cat([heads_large_chunk, heads_small_chunk], dim=2).contiguous() + + inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size + output_shape= inp_shape[: gather_idx] + \ + [total_h,] + \ + inp_shape[gather_idx + 1:] + + output = output.view(output_shape) + + return output + + def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, async_op=False, handle=None, type=None): seq_world_size = dist.get_world_size(group) + # we only need num_heads once + num_heads = input.shape[2] + + if get_num_kv_heads() is not None or num_heads % seq_world_size != 0: + # Assuming here that the number of heads for q is consistent with kv + # If not, additional logic is required for cases like GQA + if get_num_kv_heads() is None: + assert num_heads > seq_world_size, f"Number of heads ({num_heads}) must be larger than sequence parallel size ({seq_world_size})" + # set heads at first call by num_total_heads. + # then use ``get_num_kv_heads() is not None`` to re-entry uneven path. + set_num_kv_heads(num_heads) + assert async_op == False, "uneven head sp does not support async op" + return uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group) + if batch_dim_idx == 0: # b, s, n, h if scatter_idx < 2: diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index c6a202d485eba..983e64642c696 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -16,7 +16,7 @@ from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state, safe_set_full_grad from .tensor_fragment import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state -from .z3_leaf_module import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module, z3_leaf_parameter +from .z3_leaf_module import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module, z3_leaf_parameter, set_z3_leaf_module from .mixed_precision_linkage import link_hp_params, lazy_init_hp_params_optimizer_state from deepspeed.runtime.dataloader import RepeatingLoader from .numa import get_numactl_cmd diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index 9dd288ef46dbb..e9550a0ec25ad 100755 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -484,6 +484,8 @@ def _get_sequence_parallel_rank(): global mpu if mpu is not None and hasattr(mpu, 'get_sequence_parallel_rank'): return mpu.get_sequence_parallel_rank() + if mesh_device is not None: + return dist.get_rank(mesh_device.get_group(mesh_dim="sequence_parallel")) return 0 diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index 55fb72f6c9fba..d5db29485db39 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -7,8 +7,7 @@ import logging import sys import os -import torch -from deepspeed.runtime.compiler import is_compile_supported +from deepspeed.runtime.compiler import is_compile_supported, is_compiling log_levels = { "debug": logging.DEBUG, @@ -26,7 +25,7 @@ def create_warning_filter(logger): def warn_once(record): nonlocal warn - if is_compile_supported() and torch.compiler.is_compiling() and not warn: + if is_compile_supported() and is_compiling() and not warn: warn = True logger.warning("To avoid graph breaks caused by logger in compile-mode, it is recommended to" " disable logging by setting env var DISABLE_LOGS_WHILE_COMPILING=1") @@ -39,7 +38,7 @@ def logging_decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): - if torch.compiler.is_compiling(): + if is_compiling(): return else: return func(*args, **kwargs) diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py index 4fe7cbba90aeb..aba3b5179d418 100644 --- a/deepspeed/utils/numa.py +++ b/deepspeed/utils/numa.py @@ -23,7 +23,10 @@ # ] def get_numa_cores(): ret = [] - output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + try: + output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + except: + return [] lines = output.split('\n') for line in lines: if line.startswith('available:'): diff --git a/deepspeed/utils/z3_leaf_module.py b/deepspeed/utils/z3_leaf_module.py index 47d9ff698f1f0..14e8ae2d28235 100644 --- a/deepspeed/utils/z3_leaf_module.py +++ b/deepspeed/utils/z3_leaf_module.py @@ -4,7 +4,7 @@ # DeepSpeed Team import torch -from typing import List, Type +from typing import List, Type, Union def z3_leaf_module(model: torch.nn.Module) -> bool: @@ -40,18 +40,24 @@ def get_z3_leaf_modules(model: torch.nn.Module) -> List[torch.nn.Module]: return [module for module in model.modules() if z3_leaf_module(module)] -def _do_set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: List[Type], +def set_z3_leaf_module(model: torch.nn.Module, flag: bool): + model._z3_leaf = flag + + +def _do_set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: Union[List[Type], List[str]], flag: bool) -> List[torch.nn.Module]: - assert all(isinstance(module_class, type) for module_class in leaf_module_classes), \ - f'leaf_module_classes must be a list of types, got {leaf_module_classes}' + assert all(isinstance(module_class, (type, str) ) for module_class in leaf_module_classes), \ + f'leaf_module_classes must be a list of types or names, got {leaf_module_classes}' leaf_modules = [] def _set_z3_leaf_flag(model: torch.nn.Module): nonlocal leaf_modules - if model.__class__ in leaf_module_classes: - model._z3_leaf = flag - leaf_modules.append(model) + for module in leaf_module_classes: + if (isinstance(module, type) and model.__class__ == module) or \ + (isinstance(module, str) and model.__class__.__name__ == module): + model._z3_leaf = flag + leaf_modules.append(model) model.apply(_set_z3_leaf_flag) @@ -61,13 +67,14 @@ def _set_z3_leaf_flag(model: torch.nn.Module): return leaf_modules -def set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: List[Type]) -> List[torch.nn.Module]: +def set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: Union[List[Type], + List[str]]) -> List[torch.nn.Module]: """Sets a flag within a module in `model` to instruct ZeRO3 to stop setting hooks recursively when it encounters a module class listed in `leaf_module_classes`. This is particularly useful in the context of Mixture of Experts (MoE) models. In MoE models, the computation order of experts varies across forward passes. This variability can disrupt ZeRO3's functionality, as ZeRO3 relies on tracking the computation order of modules to prefetch parameters efficiently. By designating a module as a 'leaf' node, ZeRO3 will prefetch parameters for all child modules upon entering the module. Another scenario where this functionality is beneficial is in models with excessively fine-grained nested modules, where it helps to avoid the overhead associated with hooks. Args: model (torch.nn.Module): The model to which the leaf module flag will be applied. - leaf_module_classes (List[Type]): A list of module classes that should be flagged as 'leaf' modules. + leaf_module_classes (Union[List[Type], List[str]]): A list of module classes that should be flagged as 'leaf' modules. Returns: List[torch.nn.Module]: A list of modules that match the module classes in `leaf_module_classes`. """ @@ -79,7 +86,7 @@ def unset_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: List[Type See `set_z3_leaf_modules` for more details. Args: model (torch.nn.Module): The model to which the leaf module flag will be applied. - leaf_module_classes (List[Type]): A list of module classes that should be flagged as 'leaf' modules. + leaf_module_classes (Union[List[Type], List[str]]): A list of module classes that should be flagged as 'leaf' modules. Returns: List[torch.nn.Module]: A list of modules that match the module classes in `leaf_module_classes`. """ diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index e69ecd9acb5a2..e93cb1c95f15c 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -21,7 +21,9 @@ import math import os import re +import gc import json +import numpy as np from tqdm import tqdm from collections import OrderedDict from dataclasses import dataclass @@ -100,7 +102,7 @@ def get_model_state_files(checkpoint_dir): def parse_model_states(files): zero_model_states = [] for file in files: - state_dict = torch.load(file, map_location=device) + state_dict = torch.load(file, map_location=device, weights_only=False) if BUFFER_NAMES not in state_dict: raise ValueError(f"{file} is not a model state checkpoint") @@ -146,8 +148,8 @@ def parse_model_states(files): def parse_optim_states(files, ds_checkpoint_dir): total_files = len(files) state_dicts = [] - for f in files: - state_dict = torch.load(f, map_location=device) + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights # and also handle the case where it was already removed by another helper script state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) @@ -179,19 +181,7 @@ def parse_optim_states(files, ds_checkpoint_dir): else: raise ValueError(f"unknown zero stage {zero_stage}") - if zero_stage <= 2: - fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] - elif zero_stage == 3: - # if there is more than one param group, there will be multiple flattened tensors - one - # flattened tensor per group - for simplicity merge them into a single tensor - # - # XXX: could make the script more memory efficient for when there are multiple groups - it - # will require matching the sub-lists of param_shapes for each param group flattened tensor - - fp32_flat_groups = [ - torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) - ] - + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] return zero_stage, world_size, fp32_flat_groups @@ -398,9 +388,56 @@ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): param_shapes = zero_model_states[0].param_shapes - avail_numel = fp32_flat_groups[0].numel() * world_size + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each # param, re-consolidating each param, while dealing with padding if any @@ -424,7 +461,8 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero offset = 0 total_numel = 0 total_params = 0 - for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'): + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): unpartitioned_numel = shape.numel() total_numel += unpartitioned_numel total_params += 1 @@ -435,10 +473,9 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" ) - # XXX: memory usage doubles here - state_dict[name] = torch.cat( - tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), - 0).narrow(0, 0, unpartitioned_numel).view(shape) + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor offset += partitioned_numel offset *= world_size @@ -473,7 +510,29 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zer return state_dict -def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: + shared_tensor = state_dict[converted_tensors[tensor_id]] + state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + state_dict[name] = tensor.contiguous() + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): """ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example @@ -483,14 +542,12 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f - ``checkpoint_dir``: path to the desired checkpoint folder - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` Returns: - pytorch ``state_dict`` - Note: this approach may not work if your application doesn't have sufficient free CPU memory and - you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with - the checkpoint. - A typical usage might be :: from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint @@ -506,6 +563,16 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use """ if tag is None: latest_path = os.path.join(checkpoint_dir, 'latest') @@ -520,7 +587,11 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f if not os.path.isdir(ds_checkpoint_dir): raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") - return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, @@ -541,6 +612,7 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` - ``exclude_frozen_parameters``: exclude frozen parameters """ + # Dependency pre-check if safe_serialization: try: @@ -556,13 +628,18 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, raise # Convert zero checkpoint to state_dict - state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) # Shard the model if it is too big. weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" if max_shard_size is not None: filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") - state_dict_split = split_torch_state_dict_into_shards(state_dict, + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size) else: @@ -571,15 +648,22 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, state_dict_split = StateDictSplit(is_sharded=False, filename_to_tensors={weights_name: list(state_dict.keys())}) - # Save the model + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) filename_to_tensors = state_dict_split.filename_to_tensors.items() for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): - shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) output_path = os.path.join(output_dir, shard_file) if safe_serialization: - save_file(shard, output_path, metadata={"format": "pt"}) + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) else: - torch.save(shard, output_path) + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in shard_state_dict: + del state_dict[tensor_name] + del shard_state_dict + gc.collect() # Save index if sharded if state_dict_split.is_sharded: diff --git a/docker/gh-builder/Dockerfile.py311 b/docker/gh-builder/Dockerfile.py311 new file mode 100644 index 0000000000000..603fb614314fc --- /dev/null +++ b/docker/gh-builder/Dockerfile.py311 @@ -0,0 +1,35 @@ +# Start with NGC container +FROM nvcr.io/nvidia/pytorch:24.03-py3 + +# Set noninteractive mode for apt-get +ARG DEBIAN_FRONTEND=noninteractive + +# Install necessary dependencies for building Python +RUN apt-get update && apt-get install -y \ + wget \ + build-essential \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + curl \ + libncursesw5-dev \ + libgdbm-dev \ + libc6-dev \ + libffi-dev \ + tk-dev \ + && rm -rf /var/lib/apt/lists/* + +# Download and install Python 3.11 +RUN wget https://www.python.org/ftp/python/3.11.9/Python-3.11.9.tgz \ + && tar xzf Python-3.11.9.tgz \ + && cd Python-3.11.9 \ + && ./configure --enable-optimizations \ + && make altinstall \ + && cd .. \ + && rm -rf Python-3.11.9 Python-3.11.9.tgz + +# Set Python 3.11 as the default Python version +RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.11 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.11 1 diff --git a/docker/gh-builder/Dockerfile.py312 b/docker/gh-builder/Dockerfile.py312 new file mode 100644 index 0000000000000..a0a7193201d47 --- /dev/null +++ b/docker/gh-builder/Dockerfile.py312 @@ -0,0 +1,35 @@ +# Start with NGC container +FROM nvcr.io/nvidia/pytorch:24.03-py3 + +# Set noninteractive mode for apt-get +ARG DEBIAN_FRONTEND=noninteractive + +# Install necessary dependencies for building Python +RUN apt-get update && apt-get install -y \ + wget \ + build-essential \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + curl \ + libncursesw5-dev \ + libgdbm-dev \ + libc6-dev \ + libffi-dev \ + tk-dev \ + && rm -rf /var/lib/apt/lists/* + +# Download and install Python 3.12 +RUN wget https://www.python.org/ftp/python/3.12.5/Python-3.12.5.tgz \ + && tar xzf Python-3.12.5.tgz \ + && cd Python-3.12.5 \ + && ./configure --enable-optimizations \ + && make altinstall \ + && cd .. \ + && rm -rf Python-3.12.5 Python-3.12.5.tgz + +# Set Python 3.12 as the default Python version +RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.12 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.12 1 diff --git a/docker/Dockerfile.rocm b/docker/rocm/Dockerfile similarity index 100% rename from docker/Dockerfile.rocm rename to docker/rocm/Dockerfile diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index adb2f1679ea0e..51e3bbd6eaaa2 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -489,6 +489,11 @@ Enabling and configuring ZeRO memory optimizations |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------- | | Consolidate the weights before saving the model by `save_16bit_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | `False` | +***stage3_module_granularity_threshold***: [integer] +| Description | Default | +|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------- | +| The granularity of a module is determined by the ratio of `parameter_count` / `(1 + descendant_count)`. ZeRO3 classifies modules with a granularity below the threshold as fine-grained, treating them as integral units during parameter fetching. This reduces host and communication overhead from separate hooks. | `0` | + ***zero_hpz_partition_size***: [integer] | Description | Default | diff --git a/docs/_tutorials/bert-finetuning.md b/docs/_tutorials/bert-finetuning.md index 3014be18d6820..f833acebde9af 100755 --- a/docs/_tutorials/bert-finetuning.md +++ b/docs/_tutorials/bert-finetuning.md @@ -10,14 +10,14 @@ In this tutorial we will be adding DeepSpeed to the BingBert model for the SQuAD If you don't already have a copy of the DeepSpeed repository, please clone in now and checkout the DeepSpeedExamples submodule the contains the BingBertSquad -example (DeepSpeedExamples/BingBertSquad) we will be going over in the rest of +example (DeepSpeedExamples/training/BingBertSquad) we will be going over in the rest of this tutorial. ```shell git clone https://github.com/microsoft/DeepSpeed cd DeepSpeed git submodule update --init --recursive -cd DeepSpeedExamples/BingBertSquad +cd DeepSpeedExamples/training/BingBertSquad ``` ### Pre-requisites diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index ce9e3ee9a8922..36dcdf41d9d08 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -9,7 +9,7 @@ tags: getting-started * Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/). * To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/deepspeed) -* DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag + config file [See more details](https://huggingface.co/docs/transformers/main_classes/deepspeed). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer [See more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed). +* DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag + config file [See more details](https://huggingface.co/docs/transformers/deepspeed). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer [See more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed). * DeepSpeed on AMD can be used via our [ROCm images](https://hub.docker.com/r/deepspeed/rocm501/tags), e.g., `docker pull deepspeed/rocm501:ds060_pytorch110`. * DeepSpeed also supports Intel Xeon CPU, Intel Data Center Max Series XPU, Intel Gaudi HPU, Huawei Ascend NPU etc, please refer to the [accelerator setup guide](/tutorials/accelerator-setup-guide/) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index b1a8b53697610..e66bba3f818b2 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -136,7 +136,7 @@ You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [Hug ### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam -We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. +We provide example scripts under [DeepSpeedExamples/training/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun.