From cb3571bec7de7752cdbbb027c89fc8391077c4c9 Mon Sep 17 00:00:00 2001 From: Alexander Weinrauch Date: Fri, 22 Nov 2024 11:02:57 +0000 Subject: [PATCH 1/5] [AMD] Updated CI docker image to rocm+asan 6.2 and pytorch 2.5.1 --- .github/workflows/integration-tests.yml | 2 +- .github/workflows/integration-tests.yml.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 2922da501efb..3c00a85d9162 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -327,7 +327,7 @@ jobs: runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}} name: Integration-Tests (${{matrix.runner[1] == 'gfx90a' && 'mi210' || 'mi300x'}}) container: - image: rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.4 + image: rocmshared/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root steps: - name: Checkout diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 7de7264272c1..5489945d785d 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -374,7 +374,7 @@ jobs: name: Integration-Tests (${{matrix.runner[1] == 'gfx90a' && 'mi210' || 'mi300x'}}) container: - image: rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.4 + image: rocmshared/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root steps: From d8c6fcd98411bcb9ac6b3ad74f89c3a92ab0cfaf Mon Sep 17 00:00:00 2001 From: Alexander Weinrauch Date: Tue, 5 Nov 2024 15:07:21 +0000 Subject: [PATCH 2/5] Fix unhandled profile event in RoctracerProfiler --- .../csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff 
--git a/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp b/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp index adc908d2cd96..ca93678e1c82 100644 --- a/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp @@ -74,6 +74,7 @@ std::shared_ptr convertActivityToMetric(const roctracer_record_t *activity) { std::shared_ptr metric; switch (activity->kind) { + case kHipVdiCommandTask: case kHipVdiCommandKernel: { if (activity->begin_ns < activity->end_ns) { metric = std::make_shared( @@ -135,7 +136,7 @@ void processActivity(RoctracerProfiler::CorrIdToExternIdMap &corrIdToExternId, const roctracer_record_t *record, bool isAPI, bool isGraph) { switch (record->kind) { - case 0x11F1: // Task - kernel enqueued by graph launch + case kHipVdiCommandTask: case kHipVdiCommandKernel: { processActivityKernel(corrIdToExternId, externId, dataSet, record, isAPI, isGraph); @@ -169,6 +170,7 @@ std::pair matchKernelCbId(uint32_t cbId) { case HIP_API_ID_hipModuleLaunchCooperativeKernel: case HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice: case HIP_API_ID_hipGraphExecDestroy: + case HIP_API_ID_hipGraphInstantiateWithFlags: case HIP_API_ID_hipGraphInstantiate: { isRuntimeApi = true; break; @@ -300,6 +302,13 @@ void RoctracerProfiler::RoctracerProfilerPimpl::apiCallback( pImpl->StreamToCaptureCount[Stream]++; break; } + case HIP_API_ID_hipGraphInstantiateWithFlags: { + hipGraph_t Graph = data->args.hipGraphInstantiateWithFlags.graph; + hipGraphExec_t GraphExec = + *(data->args.hipGraphInstantiateWithFlags.pGraphExec); + pImpl->GraphExecToGraph[GraphExec] = Graph; + break; + } case HIP_API_ID_hipGraphInstantiate: { hipGraph_t Graph = data->args.hipGraphInstantiate.graph; hipGraphExec_t GraphExec = *(data->args.hipGraphInstantiate.pGraphExec); From 00639ad9ea5dd7e1db7561531ad952fb6ad7ac6d Mon Sep 17 00:00:00 2001 From: Alexander Weinrauch Date: Fri, 22 Nov 
2024 12:05:47 +0000 Subject: [PATCH 3/5] [AMD] Uninstall pytorch-triton-rocm in CI pipeline --- .github/workflows/integration-tests.yml | 2 +- .github/workflows/integration-tests.yml.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 3c00a85d9162..ea277ec00399 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -411,7 +411,7 @@ jobs: id: amd-install-triton run: | echo "PATH is '$PATH'" - pip uninstall -y triton + pip uninstall -y triton pytorch-triton-rocm cd python ccache --zero-stats pip install -v -e '.[tests]' diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 5489945d785d..29ca38f5a8d5 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -406,7 +406,7 @@ jobs: id: amd-install-triton run: | echo "PATH is '$PATH'" - pip uninstall -y triton + pip uninstall -y triton pytorch-triton-rocm cd python ccache --zero-stats pip install -v -e '.[tests]' From 736193db1a8e331e0422ef0de5a1231c41404024 Mon Sep 17 00:00:00 2001 From: Alexander Weinrauch Date: Fri, 22 Nov 2024 15:38:49 +0000 Subject: [PATCH 4/5] Use system clang instead of rocm's clang version --- .github/workflows/integration-tests.yml | 5 +++-- .github/workflows/integration-tests.yml.in | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index ea277ec00399..a4e791699111 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -396,9 +396,10 @@ jobs: mkdir -p ~/.ccache du -h -d 1 ~/.ccache - - name: Update PATH + - name: Update compiler to clang run: | - echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH + export CC=/usr/bin/clang + export CXX=/usr/bin/clang++ - name: Install pip dependencies run: | python3 -m 
pip install --upgrade pip diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 29ca38f5a8d5..8b995b44df46 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -388,9 +388,10 @@ jobs: - *restore-build-artifacts-step - *inspect-cache-directories-step - - name: Update PATH + - name: Update compiler to clang run: | - echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH + export CC=/usr/bin/clang + export CXX=/usr/bin/clang++ - name: Install pip dependencies run: | From e2e32905457197760f62dbbde634b1fd96b6d218 Mon Sep 17 00:00:00 2001 From: Alexander Weinrauch Date: Fri, 22 Nov 2024 17:03:03 +0000 Subject: [PATCH 5/5] [AMD] Removed dependency installs which are no longer required --- .github/workflows/integration-tests.yml | 8 -------- .github/workflows/integration-tests.yml.in | 10 ---------- 2 files changed, 18 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index a4e791699111..9151560ce6b9 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -400,14 +400,6 @@ jobs: run: | export CC=/usr/bin/clang export CXX=/usr/bin/clang++ - - name: Install pip dependencies - run: | - python3 -m pip install --upgrade pip - python3 -m pip install lit - - name: Install apt dependencies - run: | - apt update - apt install ccache - name: Install Triton id: amd-install-triton run: | diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 8b995b44df46..6d72b65207dc 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -393,16 +393,6 @@ jobs: export CC=/usr/bin/clang export CXX=/usr/bin/clang++ - - name: Install pip dependencies - run: | - python3 -m pip install --upgrade pip - python3 -m pip install lit - - - name: Install apt dependencies - run: | - apt update - apt install ccache - 
- name: Install Triton id: amd-install-triton run: |