Skip to content

Commit

Permalink
Merge branch 'release/0.10' into yutji/rocm-6.0-dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
yukirora authored Dec 23, 2023
2 parents 2858d84 + c635f75 commit dc7a64e
Show file tree
Hide file tree
Showing 12 changed files with 140 additions and 75 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,22 @@ jobs:
dockerfile: cuda12.2
tags: superbench/main:cuda12.2
runner: [self-hosted, rocm-build]
build_args: "NUM_MAKE_JOBS=64"
- name: cuda11.1.1
dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest
runner: ubuntu-latest
build_args: "NUM_MAKE_JOBS=8"
- name: rocm5.7
dockerfile: rocm5.7.x
tags: superbench/main:rocm5.7
runner: [self-hosted, rocm-build]
build_args: "NUM_MAKE_JOBS=64"
- name: rocm6.0
dockerfile: rocm6.0.x
tags: superbench/main:rocm6.0
runner: [self-hosted, rocm-build]
build_args: "NUM_MAKE_JOBS=64"
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down Expand Up @@ -80,7 +84,7 @@ jobs:
fi
DOCKERFILE=dockerfile/${{ matrix.dockerfile }}.dockerfile
BUILD_ARGS="NUM_MAKE_JOBS=8"
BUILD_ARGS=${{ matrix.build_args }}
if [[ "${{ matrix.extra_args }}" ]]; then
BUILD_ARGS="${BUILD_ARGS} ${{ matrix.extra_args }}"
fi
Expand Down
53 changes: 27 additions & 26 deletions dockerfile/rocm5.7.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ RUN apt-get update && \
apt-get -q install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
Expand All @@ -27,6 +28,7 @@ RUN apt-get update && \
libaio-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libnuma-dev \
libpci-dev \
libssl-dev \
Expand All @@ -38,6 +40,7 @@ RUN apt-get update && \
openssh-client \
openssh-server \
pciutils \
python3-mpi4py \
rsync \
sudo \
util-linux \
Expand All @@ -46,11 +49,11 @@ RUN apt-get update && \
&& \
rm -rf /tmp/*

ARG NUM_MAKE_JOBS=16
ARG NUM_MAKE_JOBS=

# Check if CMake is installed and its version
RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \
required_version="3.26.4" && \
required_version="3.24.1" && \
if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \
echo "existing cmake version is ${cmake_version}" && \
cd /tmp && \
Expand Down Expand Up @@ -100,21 +103,9 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \
rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
fi

# Install UCX
ENV UCX_VERSION=1.14.1
RUN if [ -z "$(ls -A /opt/ucx)" ]; then \
echo "/opt/ucx is empty. Installing UCX..."; \
cd /tmp && \
git clone https://github.com/openucx/ucx.git -b v${UCX_VERSION} && \
cd ucx && \
./autogen.sh && \
mkdir build && \
cd build && \
../configure -prefix=$UCX_DIR --with-rocm=/opt/rocm --without-knem && \
make -j $(nproc) && make -j $(nproc) install && rm -rf /tmp/ucx-${UCX_VERSION} ; \
else \
echo "/opt/ucx is not empty. Skipping UCX installation."; \
fi
# Add target file to help determine which device(s) to build for
ENV ROCM_PATH=/opt/rocm
RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst'

# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
Expand All @@ -127,7 +118,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \
./autogen.pl && \
mkdir build && \
cd build && \
../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --enable-mca-no-build=btl-uct --with-ucx=/opt/ucx --with-rocm=/opt/rocm && \
../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
make -j $(nproc) && \
make -j $(nproc) install && \
ldconfig && \
Expand All @@ -148,12 +139,18 @@ RUN cd /opt/ && \
cd rccl && \
mkdir build && \
cd build && \
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. && \
CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \
-DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \
.. && \
make -j${NUM_MAKE_JOBS}

# Install AMD SMI Python Library
RUN cd /opt/rocm/share/amd_smi && \
python3 -m pip install --user .

ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/opt/ucx/lib:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
Expand All @@ -163,13 +160,17 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment

RUN apt install rocm-cmake -y && \
python3 -m pip install --upgrade pip wheel setuptools==65.7

WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm

ADD . .
RUN apt install rocm-cmake -y && \
python3 -m pip install --upgrade pip wheel setuptools==65.7 && \
python3 -m pip install .[amdworker] && \
#ENV USE_HIPBLASLT_DATATYPE=1
ENV CXX=/opt/rocm/bin/hipcc
RUN python3 -m pip install .[amdworker] && \
make cppbuild && \
make postinstall
RUN make cppbuild
ADD third_party third_party
RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
11 changes: 6 additions & 5 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,18 @@ Large scale matmul operation using `torch.matmul` with one GPU.
|--------------------------------|-----------|--------------------------------|
| pytorch-matmul/nosharding_time | time (ms) | Time of pure matmul operation. |

### `cublaslt-gemm`
### `cublaslt-gemm` / `hipblaslt-gemm`

#### Introduction

Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul).
Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul) or [`hipblasLt-bench`](https://github.com/ROCm/hipBLASLt/blob/develop/clients/benchmarks/README.md).

#### Metrics

| Name | Unit | Description |
|----------------------------------------------------------|----------------|---------------------------------|
| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
| Name | Unit | Description |
|-----------------------------------------------------------|----------------|---------------------------------|
| cublaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
| hipblaslt-gemm/${dtype}\_${batch}\_${m}\_${n}\_${k}_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |

### `cublas-function`

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1")
if(DEFINED ENV{USE_HIPBLASLT_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLASLT_DATATYPE=1")
elseif(DEFINED ENV{USE_HIP_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIP_DATATYPE=1")
endif()
if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLAS_COMPUTETYPE=1")
endif()
target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device)
else()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,18 @@ using cublasLtHalf = hipblasLtHalf;
#if defined(USE_HIPBLASLT_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLASLT_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLASLT_R_32F
#elif defined(USE_HIP_DATATYPE)
#define DIST_INF_HIP_DATATYPE_R_16F HIP_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIP_R_32F
#else
#define DIST_INF_HIP_DATATYPE_R_16F HIPBLAS_R_16F
#define DIST_INF_HIP_DATATYPE_R_32F HIPBLAS_R_32F
#endif
#if defined(USE_HIPBLAS_COMPUTETYPE)
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLAS_COMPUTE_32F
#else
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32
#endif
#else
#include <cublasLt.h>
#include <cuda_fp16.h>
Expand Down Expand Up @@ -244,8 +252,10 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matF, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));
CHECK_CUBLASLT_ERROR(hipblasLtMatrixLayoutCreate(&matG, DIST_INF_HIP_DATATYPE_R_16F, k, n, k));

CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul1, HIPBLASLT_COMPUTE_F32, DIST_INF_HIP_DATATYPE_R_32F));
CHECK_CUBLASLT_ERROR(hipblasLtMatmulDescCreate(&matmul2, HIPBLASLT_COMPUTE_F32, DIST_INF_HIP_DATATYPE_R_32F));
CHECK_CUBLASLT_ERROR(
hipblasLtMatmulDescCreate(&matmul1, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));
CHECK_CUBLASLT_ERROR(
hipblasLtMatmulDescCreate(&matmul2, DIST_INF_HIP_COMPUTETYPE_F32, DIST_INF_HIP_DATATYPE_R_32F));

hipblasOperation_t trans = HIPBLAS_OP_N;
CHECK_CUBLASLT_ERROR(
Expand Down
3 changes: 2 additions & 1 deletion superbench/benchmarks/micro_benchmarks/hipblaslt_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
raise ValueError('Invalid result')

self._result.add_result(
f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops', float(fields[-2])
f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops',
float(fields[-2]) / 1000
)
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
Expand Down
3 changes: 1 addition & 2 deletions superbench/benchmarks/micro_benchmarks/rocm_common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")

if(EXISTS ${HIP_PATH})
# Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
set(CMAKE_PREFIX_PATH /opt/rocm ROCM_PATH)
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
Expand Down
Loading

0 comments on commit dc7a64e

Please sign in to comment.