Skip to content

Commit

Permalink
Merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
loadams committed Apr 8, 2024
2 parents a9302c3 + 72bd275 commit 2f1eb2f
Show file tree
Hide file tree
Showing 277 changed files with 11,446 additions and 2,210 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/amd-mi200.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: amd-mi200

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
25 changes: 4 additions & 21 deletions .github/workflows/cpu-inference.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: cpu-inference

on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/cpu-inference.yml'
Expand All @@ -10,7 +11,6 @@ on:
- '!deepspeed/inference/v2/**' # exclude v2 dir
- 'tests/unit/inference/**'
- '!tests/unit/inference/v2/**' # exclude v2 tests dir
workflow_dispatch:
merge_group:
branches: [ master ]
schedule:
Expand Down Expand Up @@ -47,42 +47,26 @@ jobs:
- name: Detect instruction sets on instance
run: |
lscpu
pip install cmake
git clone https://github.com/intel/intel-extension-for-pytorch
cd intel-extension-for-pytorch/tests/cpu/isa
cmake .
make
./cpu_features
- name: Install numactl
run: |
sudo apt-get install -y numactl
- name: Install oneCCL Bindings for PyTorch
- name: Install dependencies
run: |
pip install torch
python -m pip install intel_extension_for_pytorch
# the curl line is for troubleshooting
curl -L https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
pip install py-cpuinfo
# check installed version
pip list |grep \\\<torch\\\>
pip list |grep intel-extension-for-pytorch
pip list |grep oneccl-bind-pt
- name: Install oneCCL
run: |
pip install cmake
git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
mkdir build
cd build
cmake ..
make
make install
#source ./_install/env/setvars.sh
# test whether oneCCL is correctly installed
#mpirun -n 2 ./examples/benchmark/benchmark
make -j install
- name: Install transformers
run: |
Expand All @@ -103,7 +87,6 @@ jobs:
source oneCCL/build/_install/env/setvars.sh
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6
# check whether the environment is properly setup
python -c "import torch;import intel_extension_for_pytorch as ipex;import oneccl_bindings_for_pytorch;print('done')"
python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())"
- name: Unit tests
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-torch-latest-cpu
name: cpu-torch-latest

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
Expand All @@ -26,9 +27,13 @@ jobs:
- id: setup-venv
uses: ./.github/workflows/setup-venv

- name: Install system packages
run: |
sudo apt-get install -y numactl pdsh
- name: Install pytorch
run: |
pip install torch==1.12.0+cpu torchvision==0.13.0+cpu torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -45,5 +50,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
HF_HOME=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="1.12"
HF_HOME=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="1.12"
HF_HOME=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.2"
HF_HOME=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.2"
1 change: 1 addition & 0 deletions .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: Formatting

on:
workflow_dispatch:
pull_request:
branches:
'**'
Expand Down
119 changes: 119 additions & 0 deletions .github/workflows/hpu-gaudi2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
name: hpu-gaudi2

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- ".github/workflows/hpu-gaudi2.yml"
- "accelerator/hpu_accelerator.py"


concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write

jobs:
unit-tests:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

env:
PT_HPU_LAZY_MODE: 0
TEST_LIST: |
test_accelerator.py
test_autotuning.py
test_compression.py
test_dist.py
test_elastic.py
(test_intX_quantization.py and test_quantized_linear)
test_ds_arguments.py
test_run.py
test_multinode_runner.py
test_moe_tp.py
test_monitor.py
(test_zero_optimizer.py and (TestSaveTensorClone or TestZeRONonDistributed))
(test_latest_checkpoint.py and test_missing_latest)
test_reshape_checkpoint.py
test_shared_weights.py
test_sparse.py
test_tag_validation.py
test_pipe_module.py
(test_flops_profiler.py and test_flops_profiler_in_inference)
test_get_optim_files.py
test_groups.py
test_init_on_device.py
test_partition_balanced.py
(test_adamw.py and TestAdamConfigs)
test_coalesced_collectives.py
test_activation_checkpointing_non_reentrant.py
test_activation_checkpointing.py
test_data.py
(test_ds_config_dict.py and (TestBasicConfig or TestBatchConfig))
test_ds_config_model.py
test_mup_optimizers.py
(test_pld.py and test_pld_schedule)
test_runtime_utils.py
test_pipe_schedule.py
test_topology.py
(test_ds_initialize.py and (TestClientOptimizer or TestClientLrScheduler))
test_csr.py
(test_fp16.py and (TestZeroEmptyGrad or TestZeroAllowUntestedOptimizer))
(test_bf16.py and TestZeroDtypeCocktail)
test_partition.py
test_ignore_unused_parameters.py
test_zero_config.py
test_zero_context_ancestry.py
(test_zero_context.py and not TestSerialContext)
test_zero_dynamic_class.py
test_zero_nesting_init.py
test_zeropp.py
(test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v3

- name: Check container state
run: |
ldd --version
hl-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
pip install .
- name: Install deepspeed
run: |
pip install .[dev,autotuning]
ds_report
- name: Python environment
run: |
pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
echo "TEST_LIST ${TEST_LIST}"
echo "PT_HPU_LAZY_MODE ${PT_HPU_LAZY_MODE}"
pytest --verbose unit/ -k "${TEST_LIST}"
20 changes: 14 additions & 6 deletions .github/workflows/nv-a6000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ on:
- 'tests/unit/inference/v2/**'
- '.github/workflows/nv-a6000.yml'
workflow_dispatch:
inputs:
mii_branch:
description: 'DeepSpeed-MII Branch'
required: false
default: 'main'
type: string

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
Expand Down Expand Up @@ -45,7 +47,8 @@ jobs:
- name: Install deepspeed
run: |
python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
python -m pip install .[dev,1bit,autotuning]
python -m pip install pydantic==1.10.11
python -m pip install .[dev,1bit,autotuning,inf]
ds_report
- name: Python environment
run: |
Expand All @@ -58,7 +61,12 @@ jobs:
python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
- name: MII unit tests
run: |
git clone --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
BRANCH="main"
if [[ ! -z "${{ github.event.inputs.mii_branch }}" ]]; then
BRANCH="${{ github.event.inputs.mii_branch }}"
fi
echo "Cloning DeepSpeed-MII branch: $BRANCH"
git clone -b $BRANCH --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
cd DeepSpeed-MII
pip install .[dev]
cd tests
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-accelerate-v100

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
Expand All @@ -18,7 +19,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
Expand All @@ -28,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/nv-ds-chat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-h100.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: nv-h100

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
53 changes: 53 additions & 0 deletions .github/workflows/nv-human-eval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
name: nv-human-eval

on:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, a6000]
container:
image: nvcr.io/nvidia/pytorch:23.03-py3
ports:
- 80
options: --gpus all --shm-size "8G"

steps:
- uses: actions/checkout@v3

- name: Check container state
run: |
ldd --version
nvcc --version
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install transformers
run: |
git clone --depth=1 https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
python -m pip install .
- name: Clone Human Eval
run: |
git clone --depth=1 https://github.com/openai/human-eval.git
sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
cd human-eval
git rev-parse --short HEAD
python -m pip install .
- name: Install deepspeed
run: |
python -m pip install .[dev,1bit,autotuning]
ds_report
- name: Python environment
run: |
python -m pip list
- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
Loading

0 comments on commit 2f1eb2f

Please sign in to comment.