From 94a79d4d79b2a1a189c807529cf15892b4284dcf Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Mar 2024 13:56:40 -0800 Subject: [PATCH 1/8] Test CI for Gaudi2 --- .github/workflows/gaudi2.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/gaudi2.yml diff --git a/.github/workflows/gaudi2.yml b/.github/workflows/gaudi2.yml new file mode 100644 index 000000000000..746beab3ce42 --- /dev/null +++ b/.github/workflows/gaudi2.yml @@ -0,0 +1,32 @@ +name: gaudi2 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + unit-tests: + # The type of runner that the job will run on + runs-on: [self-hosted, intel, gaudi2] + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Python environment + run: | + pip list From 0d59962da5543bc47511b2bae4d3f89a4ed4d6ca Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Mar 2024 14:25:30 -0800 Subject: [PATCH 2/8] Check on nodes with hl-smi --- .github/workflows/gaudi2.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gaudi2.yml b/.github/workflows/gaudi2.yml index 746beab3ce42..cd4b2c6e55fe 100644 --- a/.github/workflows/gaudi2.yml +++ b/.github/workflows/gaudi2.yml @@ -18,15 +18,20 @@ jobs: unit-tests: # The type of runner that the job will run on runs-on: [self-hosted, intel, gaudi2] + container: + image: vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + ports: + - 80 + options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v3 - - id: setup-venv - uses: ./.github/workflows/setup-venv - - - name: Python environment + - name: Check container state run: | - pip list + ldd --version + hl-smi + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From c1c985ed5bef93d44228d516ce161447b9b4bdde Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Mar 2024 14:29:04 -0800 Subject: [PATCH 3/8] Remove user defined network modes --- .github/workflows/gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gaudi2.yml b/.github/workflows/gaudi2.yml index cd4b2c6e55fe..8a6489113b3f 100644 --- a/.github/workflows/gaudi2.yml +++ b/.github/workflows/gaudi2.yml @@ -22,7 +22,7 @@ jobs: image: vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest ports: - 80 - options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host + options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice # Steps represent a sequence of tasks that will be executed as part of the job steps: From cb3bc32c98dad11b079d65bf890ff82457601598 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 12 Mar 2024 13:15:16 -0700 Subject: [PATCH 4/8] Rename yml file --- .github/workflows/{gaudi2.yml => hpu-gaudi2.yml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{gaudi2.yml => hpu-gaudi2.yml} (98%) diff --git a/.github/workflows/gaudi2.yml b/.github/workflows/hpu-gaudi2.yml similarity index 98% rename from .github/workflows/gaudi2.yml rename to .github/workflows/hpu-gaudi2.yml index 8a6489113b3f..209c38a63823 100644 --- a/.github/workflows/gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -1,4 +1,4 @@ -name: gaudi2 +name: hpu-gaudi2 on: workflow_dispatch: From 16a8c18ffd345a2329e6a72e7d7b3d46ee715d49 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 12 Mar 2024 13:19:30 -0700 Subject: [PATCH 5/8] Enable unit tests --- tests/unit/util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/util.py b/tests/unit/util.py index 75c3000bd4a2..3918f08b3f01 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -47,11 +47,14 @@ def bf16_required_version_check(accelerator_check=True): cuda_version_available = CUDA_MAJOR >= 11 nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' + hpu_available = get_Accelerator().device_name() == 'hpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True elif npu_available: return True + elif hpu_available: + return True else: return False From 53ecd23a24f76f0cd57b2a8ed0fcb1ad5fc3bf95 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 12 Mar 2024 13:23:20 -0700 Subject: [PATCH 6/8] Fix formatting check --- tests/unit/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/util.py b/tests/unit/util.py index 3918f08b3f01..e8e0f476371b 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -47,7 +47,7 @@ def bf16_required_version_check(accelerator_check=True): cuda_version_available = CUDA_MAJOR >= 11 nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' - hpu_available = get_Accelerator().device_name() == 'hpu' + hpu_available = get_accelerator().device_name() == 'hpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True From 464936eeabe2ae59217ab5f4a2be747ea7cd405e Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 12 Mar 2024 13:32:41 -0700 Subject: [PATCH 7/8] Add scaffolding of unit tests in HPU --- .github/workflows/hpu-gaudi2.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 209c38a63823..211590e75b79 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -35,3 +35,12 @@ jobs: hl-smi python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + - name: Install deepspeed + run: | + pip install .[dev] + ds_report + + - name: Python environment + run: | + pip list From f9550a6ab15204fdd8442a72cf966a1893b07c95 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 12 Mar 2024 14:04:45 -0700 Subject: [PATCH 8/8] Add path filters --- .github/workflows/hpu-gaudi2.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml index 211590e75b79..a64a337d50af 100644 --- a/.github/workflows/hpu-gaudi2.yml +++ b/.github/workflows/hpu-gaudi2.yml @@ -5,6 +5,8 @@ on: schedule: - cron: "0 0 * * *" pull_request: + paths: + - ".github/workflows/hpu-gaudi2.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}