From 921067ad9e613b56e9fe73ef3287b05afb7ccd91 Mon Sep 17 00:00:00 2001 From: yukirora Date: Thu, 21 Dec 2023 13:07:53 +0000 Subject: [PATCH 1/5] update hipblaslt metric unit to tflops --- docs/user-tutorial/benchmarks/micro-benchmarks.md | 11 ++++++----- .../benchmarks/micro_benchmarks/hipblaslt_function.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index 5155be7b0..388bfa119 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -58,17 +58,18 @@ Large scale matmul operation using `torch.matmul` with one GPU. |--------------------------------|-----------|--------------------------------| | pytorch-matmul/nosharding_time | time (ms) | Time of pure matmul operation. | -### `cublaslt-gemm` +### `cublaslt-gemm` / `hipblaslt-gemm` #### Introduction -Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul). +Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul) or [`hipblasLt-bench`](https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md). #### Metrics -| Name | Unit | Description | -|----------------------------------------------------------|----------------|---------------------------------| -| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. | +| Name | Unit | Description | +|-----------------------------------------------------------|----------------|---------------------------------| +| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. | +| hipblaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. | ### `cublas-function` diff --git a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py index 508973777..e9ba5f0e0 100644 --- a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py +++ b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py @@ -103,7 +103,7 @@ def _process_raw_result(self, cmd_idx, raw_output): raise ValueError('Invalid result') self._result.add_result( - f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2]) + f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2]) / 1000 ) except BaseException as e: self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) From 0d707c832ebd190c7db499b483b16b254db1c53f Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 22 Dec 2023 04:09:11 +0000 Subject: [PATCH 2/5] fix lint and CI --- .github/workflows/build-image.yml | 2 +- superbench/benchmarks/micro_benchmarks/hipblaslt_function.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 640ae47f8..96e729e56 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -108,7 +108,7 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Pull cache image - run: sudo docker pull $(cut -d, -f1 <<<${{ steps.metadata.outputs.tags }}) + run: sudo docker pull ${{ steps.metadata.outputs.tags }} continue-on-error: true - name: Login to the GitHub Container Registry uses: docker/login-action@v1 diff --git a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py index e9ba5f0e0..afe220e68 100644 --- a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py +++ b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py @@ -103,7 +103,8 @@ def _process_raw_result(self, cmd_idx, raw_output): raise ValueError('Invalid result') self._result.add_result( - f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2]) / 1000 + f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', + float(fields[-2]) / 1000 ) except BaseException as e: self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) From 81ee5723fa0d537728581887fcf9d5d7296e4ee8 Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 22 Dec 2023 05:03:41 +0000 Subject: [PATCH 3/5] fix test --- tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py b/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py index f91019f69..98c693a67 100644 --- a/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py +++ b/tests/benchmarks/micro_benchmarks/test_hipblaslt_function.py @@ -102,7 +102,7 @@ def test_hipblaslt_gemm_result_parsing(self): self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code) self.assertEqual(2, len(benchmark.result)) - self.assertEqual(58624.5, benchmark.result['fp16_1_896_896_896_flops'][0]) + self.assertEqual(58.6245, benchmark.result['fp16_1_896_896_896_flops'][0]) # Negative case - invalid raw output self.assertFalse(benchmark._process_raw_result(1, 'HipBLAS API failed')) From 33049b8d4cafb1f268b4e13cf9d21cf071a1a083 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Fri, 22 Dec 2023 14:15:39 +0800 Subject: [PATCH 4/5] Update build-image.yml --- .github/workflows/build-image.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 96e729e56..0e80c725c 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -29,6 +29,7 @@ jobs: dockerfile: cuda12.2 tags: superbench/main:cuda12.2 runner: [self-hosted, rocm-build] + extra_args: "NUM_MAKE_JOBS=64" - name: cuda11.1.1 dockerfile: cuda11.1.1 tags: superbench/main:cuda11.1.1,superbench/superbench:latest @@ -37,6 +38,7 @@ jobs: dockerfile: rocm5.7.x tags: superbench/main:rocm5.7 runner: [self-hosted, rocm-build] + extra_args: "NUM_MAKE_JOBS=64" steps: - name: Checkout uses: actions/checkout@v2 From 1cac34e5238b903903b606aaca880bbb9b95f6a3 Mon Sep 17 00:00:00 2001 From: yukirora Date: Fri, 22 Dec 2023 06:46:58 +0000 Subject: [PATCH 5/5] fix pipeline --- .github/workflows/build-image.yml | 7 ++++--- third_party/Makefile | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 0e80c725c..0f6dd995c 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -29,16 +29,17 @@ jobs: dockerfile: cuda12.2 tags: superbench/main:cuda12.2 runner: [self-hosted, rocm-build] - extra_args: "NUM_MAKE_JOBS=64" + build_args: "NUM_MAKE_JOBS=64" - name: cuda11.1.1 dockerfile: cuda11.1.1 tags: superbench/main:cuda11.1.1,superbench/superbench:latest runner: ubuntu-latest + build_args: "NUM_MAKE_JOBS=8" - name: rocm5.7 dockerfile: rocm5.7.x tags: superbench/main:rocm5.7 runner: [self-hosted, rocm-build] - extra_args: "NUM_MAKE_JOBS=64" + build_args: "NUM_MAKE_JOBS=64" steps: - name: Checkout uses: actions/checkout@v2 @@ -78,7 +79,7 @@ jobs: fi DOCKERFILE=dockerfile/${{ matrix.dockerfile }}.dockerfile - BUILD_ARGS="NUM_MAKE_JOBS=8" + BUILD_ARGS=${{ matrix.build_args }} if [[ "${{ matrix.extra_args }}" ]]; then BUILD_ARGS="${BUILD_ARGS} ${{ matrix.extra_args }}" fi diff --git a/third_party/Makefile b/third_party/Makefile index 032542e64..b69259da2 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -216,21 +216,21 @@ apex_rocm: cuda_msccl: sb_micro_path ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile)) cd ./msccl/executor/msccl-executor-nccl && \ - make -j4 src.build && \ + make -j $(shell nproc --ignore=2) src.build && \ cd ../../.. mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \ cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/ endif ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile)) cd ./msccl/scheduler/msccl-scheduler && \ - CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j4 && \ + CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j $(shell nproc --ignore=2) && \ cd ../../.. mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \ cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/ endif ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile)) cd ./msccl/tests/msccl-tests-nccl && \ - make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j4 && cd ../../.. + make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j $(shell nproc --ignore=2) && cd ../../.. mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \ cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/ endif