From 407b12ef2160779f4f460eb5db50d262fac90e6e Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 21 Dec 2023 10:14:43 +0800 Subject: [PATCH 01/14] add rocm6.0 dockerfile --- dockerfile/rocm6.0.x.dockerfile | 174 ++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 dockerfile/rocm6.0.x.dockerfile diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile new file mode 100644 index 000000000..3748e936a --- /dev/null +++ b/dockerfile/rocm6.0.x.dockerfile @@ -0,0 +1,174 @@ +ARG BASE_IMAGE=rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1 + +FROM ${BASE_IMAGE} + +# OS: +# - Ubuntu: 22.04 +# - Docker Client: 20.10.8 +# ROCm: +# - ROCm: 6.0 +# Pytorch: +# - torch: 2.0.1 +# Intel: +# - mlc: v3.10 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get -q install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + git \ + hipify-clang \ + iproute2 \ + jq \ + libaio-dev \ + libboost-program-options-dev \ + libcap2 \ + libcurl4-openssl-dev \ + libnuma-dev \ + libpci-dev \ + libssl-dev \ + libtinfo5 \ + libtool \ + lshw \ + net-tools \ + numactl \ + openssh-client \ + openssh-server \ + pciutils \ + python3-mpi4py \ + rsync \ + sudo \ + util-linux \ + vim \ + wget \ + && \ + rm -rf /tmp/* + +ARG NUM_MAKE_JOBS=64 + +# Check if CMake is installed and its version +RUN cmake_version=$(cmake --version 2>/dev/null | grep -oP "(?<=cmake version )(\d+\.\d+)" || echo "0.0") && \ + required_version="3.24.1" && \ + if [ "$(printf "%s\n" "$required_version" "$cmake_version" | sort -V | head -n 1)" != "$required_version" ]; then \ + echo "existing cmake version is ${cmake_version}" && \ + cd /tmp && \ + wget -q https://github.com/Kitware/CMake/releases/download/v${required_version}/cmake-${required_version}.tar.gz && \ + tar xzf cmake-${required_version}.tar.gz && \ + cd cmake-${required_version} && \ + ./bootstrap --prefix=/usr 
--no-system-curl --parallel=16 && \ + make -j ${NUM_MAKE_JOBS} && \ + make install && \ + rm -rf /tmp/cmake-${required_version}* \ + else \ + echo "CMake version is greater than or equal to 3.23"; \ + fi + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN cd /tmp && \ + wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + + +# Get Ubuntu version and set as an environment variable +RUN export UBUNTU_VERSION=$(lsb_release -r -s) +RUN echo "Ubuntu version: $UBUNTU_VERSION" +ENV UBUNTU_VERSION=${UBUNTU_VERSION} + +# Install OFED +ENV OFED_VERSION=5.9-0.5.6.0 +# Check if ofed_info is present and has a version +RUN if ! command -v ofed_info >/dev/null 2>&1; then \ + echo "OFED not found. 
Installing OFED..."; \ + cd /tmp && \ + wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \ + PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ + rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \ + fi + +# Add target file to help determine which device(s) to build for +ENV ROCM_PATH=/opt/rocm +RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst' + +# Install OpenMPI +ENV OPENMPI_VERSION=4.1.x +# Check if Open MPI is installed +RUN [ -d /usr/local/bin/mpirun ] || { \ + echo "Open MPI not found. Installing Open MPI..." && \ + cd /tmp && \ + git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ + cd ompi && \ + ./autogen.pl && \ + mkdir build && \ + cd build && \ + ../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ + make -j $(nproc) && \ + make -j $(nproc) install && \ + ldconfig && \ + cd / && \ + rm -rf /tmp/openmpi-${OPENMPI_VERSION}* ;\ + } + +# Install Intel MLC +RUN cd /tmp && \ + wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz + +# Install RCCL +RUN cd /opt/ && \ + git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \ + cd rccl && \ + mkdir build && \ + cd build && \ + CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. 
&& \ + make -j${NUM_MAKE_JOBS} + +ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ + LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ + LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + +RUN apt install rocm-cmake -y && \ + python3 -m pip install --upgrade pip wheel setuptools==65.7 + +WORKDIR ${SB_HOME} + +ADD third_party third_party +RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm + +ADD . . +#ENV USE_HIPBLASLT_DATATYPE=1 +ENV CXX=/opt/rocm/bin/hipcc +RUN python3 -m pip install .[amdworker] && \ + make cppbuild && \ + make postinstall From 3d00250773d0364b32468ea6cd4365e28fd54c98 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 21 Dec 2023 10:16:51 +0800 Subject: [PATCH 02/14] add pipeline for rocm6.0 --- .github/workflows/build-image.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 96e729e56..94d53d9a5 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -37,6 +37,10 @@ jobs: dockerfile: rocm5.7.x tags: superbench/main:rocm5.7 runner: [self-hosted, rocm-build] + - name: rocm6.0 + dockerfile: rocm6.0.x + tags: superbench/main:rocm6.0 + runner: [self-hosted, rocm-build] steps: - name: Checkout uses: actions/checkout@v2 From 55ea25c95656bd110ec638186e8e7cb821dd261a Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 21 Dec 2023 21:17:33 +0800 Subject: [PATCH 03/14] bug fix --- 
dockerfile/rocm6.0.x.dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index 3748e936a..35d87a749 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1 +ARG BASE_IMAGE=rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1 FROM ${BASE_IMAGE} @@ -112,7 +112,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10 # Install OpenMPI ENV OPENMPI_VERSION=4.1.x # Check if Open MPI is installed -RUN [ -d /usr/local/bin/mpirun ] || { \ +RUN if [ -z "$(command -v mpirun)" ]; then \ echo "Open MPI not found. Installing Open MPI..." && \ cd /tmp && \ git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ @@ -126,7 +126,7 @@ RUN [ -d /usr/local/bin/mpirun ] || { \ ldconfig && \ cd / && \ rm -rf /tmp/openmpi-${OPENMPI_VERSION}* ;\ - } + fi # Install Intel MLC RUN cd /tmp && \ @@ -164,10 +164,10 @@ RUN apt install rocm-cmake -y && \ WORKDIR ${SB_HOME} ADD third_party third_party -RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm ADD . . 
-#ENV USE_HIPBLASLT_DATATYPE=1 +ENV USE_HIP_DATATYPE=1 ENV CXX=/opt/rocm/bin/hipcc RUN python3 -m pip install .[amdworker] && \ make cppbuild && \ From 2858d846f6e4876c877ba57be0b3433d77ad38b2 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Fri, 22 Dec 2023 20:43:06 +0800 Subject: [PATCH 04/14] update lib version and bugfix --- dockerfile/rocm6.0.x.dockerfile | 29 ++++++++------ .../Megatron/megatron_deepspeed_rocm6.patch | 39 +++++++++++++++++++ third_party/perftest_rocm6.patch | 35 +++++++++++++++++ 3 files changed, 92 insertions(+), 11 deletions(-) create mode 100644 third_party/Megatron/megatron_deepspeed_rocm6.patch create mode 100644 third_party/perftest_rocm6.patch diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index 35d87a749..ddea3c103 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -7,8 +7,12 @@ FROM ${BASE_IMAGE} # - Docker Client: 20.10.8 # ROCm: # - ROCm: 6.0 -# Pytorch: +# Lib: # - torch: 2.0.1 +# - rccl: 2.18.3+hip6.0 develop:7e1cbb4 +# - hipblaslt: rocm-6.0.0(tag) +# - openmpi: 4.1.x +# - apex: 1.0.0 # Intel: # - mlc: v3.10 @@ -107,26 +111,23 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \ # Add target file to help determine which device(s) to build for ENV ROCM_PATH=/opt/rocm -RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst' +RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramecc+:xnack-\n" >> ${ROCM_PATH}/bin/target.lst' # Install OpenMPI ENV OPENMPI_VERSION=4.1.x # Check if Open MPI is installed -RUN if [ -z "$(command -v mpirun)" ]; then \ - echo "Open MPI not found. Installing Open MPI..." 
&& \ - cd /tmp && \ +RUN cd /tmp && \ git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ cd ompi && \ ./autogen.pl && \ mkdir build && \ cd build && \ - ../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ + ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ make -j $(nproc) && \ make -j $(nproc) install && \ ldconfig && \ cd / && \ - rm -rf /tmp/openmpi-${OPENMPI_VERSION}* ;\ - fi + rm -rf /tmp/openmpi-${OPENMPI_VERSION}* # Install Intel MLC RUN cd /tmp && \ @@ -146,6 +147,12 @@ RUN cd /opt/ && \ .. && \ make -j${NUM_MAKE_JOBS} +# Apply patch +RUN cd third_party/perftest && \ + git apply ../perftest_rocm6.patch +RUN cd third_party/Megatron/Megatron-DeepSpeed && \ + git apply ../megatron_deepspeed_rocm6.patch + ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ @@ -164,11 +171,11 @@ RUN apt install rocm-cmake -y && \ WORKDIR ${SB_HOME} ADD third_party third_party -RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=rocm-6.0.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm ADD . . 
ENV USE_HIP_DATATYPE=1 -ENV CXX=/opt/rocm/bin/hipcc +ENV USE_HIPBLAS_COMPUTETYPE=1 RUN python3 -m pip install .[amdworker] && \ - make cppbuild && \ + CXX=/opt/rocm/bin/hipcc make cppbuild && \ make postinstall diff --git a/third_party/Megatron/megatron_deepspeed_rocm6.patch b/third_party/Megatron/megatron_deepspeed_rocm6.patch new file mode 100644 index 000000000..39a1dc27b --- /dev/null +++ b/third_party/Megatron/megatron_deepspeed_rocm6.patch @@ -0,0 +1,39 @@ +diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +index 76086de..1533648 100644 +--- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu ++++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +@@ -4,7 +4,7 @@ + #include + #include + #include +-#ifndef __HIP_PLATFORM_HCC__ ++#ifndef __HIP_PLATFORM_AMD__ + #include + #endif + #include +diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu +index 90e1c9f..d217aec 100644 +--- a/megatron/fused_kernels/scaled_softmax_cuda.cu ++++ b/megatron/fused_kernels/scaled_softmax_cuda.cu +@@ -4,7 +4,7 @@ + #include + #include + #include +-#ifndef __HIP_PLATFORM_HCC__ ++#ifndef __HIP_PLATFORM_AMD__ + #include + #endif + #include +diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +index 74c9f3d..03b5fc8 100644 +--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu ++++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +@@ -4,7 +4,7 @@ + #include + #include + #include +-#ifndef __HIP_PLATFORM_HCC__ ++#ifndef __HIP_PLATFORM_AMD__ + #include + #endif + #include diff --git a/third_party/perftest_rocm6.patch b/third_party/perftest_rocm6.patch new file mode 100644 index 000000000..3394e5b68 --- /dev/null +++ b/third_party/perftest_rocm6.patch @@ -0,0 +1,35 @@ +diff --git a/configure.ac b/configure.ac +index 
20eceda..5b5c5ab 100755 +--- a/configure.ac ++++ b/configure.ac +@@ -237,13 +237,13 @@ AC_ARG_WITH([rocm], + ], + [AS_CASE([$with_rocm], + [yes|no], [], +- [CPPFLAGS="-I$with_rocm/include $CPPFLAGS" ++ [CPPFLAGS="-I$with_rocm/include -D__HIP_PLATFORM_AMD__=1 $CPPFLAGS" + LDFLAGS="-L$with_rocm/lib64 -Wl,-rpath=$with_rocm/lib64 -L$with_rocm/lib -Wl,-rpath=$with_rocm/lib -lamdhip64 $LDFLAGS"]) + ]) + + AS_IF([test "x$enable_rocm" = xyes], [ + AC_DEFINE([__HIP_PLATFORM_HCC__], [1], [Enable ROCm]) +- AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], ++ AC_CHECK_HEADERS([/opt/rocm/include/hip/hip_runtime_api.h], [], + [AC_MSG_ERROR([cannot include hip/hip_runtime_api.h])]) + AC_SEARCH_LIBS([hipFree], [amdhip64], [], + [AC_MSG_ERROR([cannot link with -lamdhip64])]) +diff --git a/src/rocm_memory.c b/src/rocm_memory.c +index e9a9136..cc028c9 100644 +--- a/src/rocm_memory.c ++++ b/src/rocm_memory.c +@@ -44,8 +44,8 @@ static int init_rocm(int device_id) { + + hipDeviceProp_t prop = {0}; + ROCM_CHECK(hipGetDeviceProperties(&prop, device_id)); +- printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %d\n", +- device_id, prop.name, prop.pciBusID, prop.gcnArch); ++ printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %s\n", ++ device_id, prop.name, prop.pciBusID, prop.gcnArchName); + + return SUCCESS; + } From fabaccbbec133c4243c38896f270fd2e7cbcc340 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 25 Dec 2023 11:15:32 +0800 Subject: [PATCH 05/14] bugfix --- dockerfile/rocm6.0.x.dockerfile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index ddea3c103..4f63be1fd 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -10,7 +10,7 @@ FROM ${BASE_IMAGE} # Lib: # - torch: 2.0.1 # - rccl: 2.18.3+hip6.0 develop:7e1cbb4 -# - hipblaslt: rocm-6.0.0(tag) +# - hipblaslt: 950ca43 # - openmpi: 4.1.x # - apex: 1.0.0 
# Intel: @@ -147,12 +147,6 @@ RUN cd /opt/ && \ .. && \ make -j${NUM_MAKE_JOBS} -# Apply patch -RUN cd third_party/perftest && \ - git apply ../perftest_rocm6.patch -RUN cd third_party/Megatron/Megatron-DeepSpeed && \ - git apply ../megatron_deepspeed_rocm6.patch - ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ @@ -171,7 +165,12 @@ RUN apt install rocm-cmake -y && \ WORKDIR ${SB_HOME} ADD third_party third_party -RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=rocm-6.0.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +# Apply patch +RUN cd third_party/perftest && \ + git apply ../perftest_rocm6.patch +RUN cd third_party/Megatron/Megatron-DeepSpeed && \ + git apply ../megatron_deepspeed_rocm6.patch +RUN make ROCM_PATH=/opt/rocm-6.0.0 RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=develop ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm ADD . . ENV USE_HIP_DATATYPE=1 From f34c44168de983e2945180f50e797e16bb0a62c1 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 25 Dec 2023 11:16:04 +0800 Subject: [PATCH 06/14] revise hipblaslt build --- third_party/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/third_party/Makefile b/third_party/Makefile index b69259da2..3dbe152d6 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -7,6 +7,7 @@ MPI_HOME ?= /usr/local/mpi HIP_HOME ?= /opt/rocm/hip RCCL_HOME ?= /opt/rocm/rccl HPCX_HOME ?= /opt/hpcx +ROCM_PATH ?= /opt/rocm CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2) ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' 
-f1-3) @@ -113,6 +114,10 @@ rocm_rocblas: sb_micro_path rocm_hipblaslt: sb_micro_path @if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \ if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \ + rm -rf ${ROCM_PATH}/lib/hipblaslt \ + rm -rf ${ROCM_PATH}/lib/cmake/hipblaslt \ + rm -rf ${ROCM_PATH}/include/hipblaslt \ + rm -rf ${ROCM_PATH}/share/doc/hipblaslt \ git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \ cd ./hipBLASLt && ./install.sh -dc; \ cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \ From b19e774300653563ac3a055de7412a3cdd617bcb Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 25 Dec 2023 11:54:26 +0800 Subject: [PATCH 07/14] bugfix --- third_party/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/third_party/Makefile b/third_party/Makefile index 3dbe152d6..3b362ed5d 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -114,10 +114,10 @@ rocm_rocblas: sb_micro_path rocm_hipblaslt: sb_micro_path @if [ ! 
-e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \ if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \ - rm -rf ${ROCM_PATH}/lib/hipblaslt \ - rm -rf ${ROCM_PATH}/lib/cmake/hipblaslt \ - rm -rf ${ROCM_PATH}/include/hipblaslt \ - rm -rf ${ROCM_PATH}/share/doc/hipblaslt \ + rm -rf ${ROCM_PATH}/lib/hipblaslt; \ + rm -rf ${ROCM_PATH}/lib/cmake/hipblaslt; \ + rm -rf ${ROCM_PATH}/include/hipblaslt; \ + rm -rf ${ROCM_PATH}/share/doc/hipblaslt; \ git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \ cd ./hipBLASLt && ./install.sh -dc; \ cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \ From 28eee191106149bd4f964f85d8d136a5051d8594 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 25 Dec 2023 13:01:16 +0800 Subject: [PATCH 08/14] update --- dockerfile/rocm6.0.x.dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index 4f63be1fd..4f0a28f9d 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -173,8 +173,7 @@ RUN cd third_party/Megatron/Megatron-DeepSpeed && \ RUN make ROCM_PATH=/opt/rocm-6.0.0 RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=develop ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm ADD . . 
-ENV USE_HIP_DATATYPE=1 -ENV USE_HIPBLAS_COMPUTETYPE=1 +ENV USE_HIPBLASLT_DATATYPE=1 RUN python3 -m pip install .[amdworker] && \ CXX=/opt/rocm/bin/hipcc make cppbuild && \ make postinstall From 0ff39d33ed0f16546139de3cd1b1421af516ae36 Mon Sep 17 00:00:00 2001 From: Andy li Date: Mon, 25 Dec 2023 09:04:51 +0000 Subject: [PATCH 09/14] update perftest version --- third_party/perftest | 2 +- third_party/perftest_rocm6.patch | 63 ++++++++++++++------------------ 2 files changed, 29 insertions(+), 36 deletions(-) diff --git a/third_party/perftest b/third_party/perftest index 5fb4f10a7..dffd1dd8b 160000 --- a/third_party/perftest +++ b/third_party/perftest @@ -1 +1 @@ -Subproject commit 5fb4f10a7e7827ed15e53c25810a10be279d6e23 +Subproject commit dffd1dd8b8a26dad2634a546e7e4d082dc882fbc diff --git a/third_party/perftest_rocm6.patch b/third_party/perftest_rocm6.patch index 3394e5b68..fe15bd250 100644 --- a/third_party/perftest_rocm6.patch +++ b/third_party/perftest_rocm6.patch @@ -1,35 +1,28 @@ -diff --git a/configure.ac b/configure.ac -index 20eceda..5b5c5ab 100755 ---- a/configure.ac -+++ b/configure.ac -@@ -237,13 +237,13 @@ AC_ARG_WITH([rocm], - ], - [AS_CASE([$with_rocm], - [yes|no], [], -- [CPPFLAGS="-I$with_rocm/include $CPPFLAGS" -+ [CPPFLAGS="-I$with_rocm/include -D__HIP_PLATFORM_AMD__=1 $CPPFLAGS" - LDFLAGS="-L$with_rocm/lib64 -Wl,-rpath=$with_rocm/lib64 -L$with_rocm/lib -Wl,-rpath=$with_rocm/lib -lamdhip64 $LDFLAGS"]) - ]) - - AS_IF([test "x$enable_rocm" = xyes], [ - AC_DEFINE([__HIP_PLATFORM_HCC__], [1], [Enable ROCm]) -- AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], -+ AC_CHECK_HEADERS([/opt/rocm/include/hip/hip_runtime_api.h], [], - [AC_MSG_ERROR([cannot include hip/hip_runtime_api.h])]) - AC_SEARCH_LIBS([hipFree], [amdhip64], [], - [AC_MSG_ERROR([cannot link with -lamdhip64])]) -diff --git a/src/rocm_memory.c b/src/rocm_memory.c -index e9a9136..cc028c9 100644 ---- a/src/rocm_memory.c -+++ b/src/rocm_memory.c -@@ -44,8 +44,8 @@ static int init_rocm(int 
device_id) { - - hipDeviceProp_t prop = {0}; - ROCM_CHECK(hipGetDeviceProperties(&prop, device_id)); -- printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %d\n", -- device_id, prop.name, prop.pciBusID, prop.gcnArch); -+ printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %s\n", -+ device_id, prop.name, prop.pciBusID, prop.gcnArchName); - - return SUCCESS; - } +diff --git a/configure.ac b/configure.ac +index 20eceda..c8f0c07 100755 +--- a/configure.ac ++++ b/configure.ac +@@ -237,7 +237,7 @@ AC_ARG_WITH([rocm], + ], + [AS_CASE([$with_rocm], + [yes|no], [], +- [CPPFLAGS="-I$with_rocm/include $CPPFLAGS" ++ [CPPFLAGS="-I$with_rocm/include -D__HIP_PLATFORM_AMD__=1 $CPPFLAGS" + LDFLAGS="-L$with_rocm/lib64 -Wl,-rpath=$with_rocm/lib64 -L$with_rocm/lib -Wl,-rpath=$with_rocm/lib -lamdhip64 $LDFLAGS"]) + ]) + +diff --git a/src/rocm_memory.c b/src/rocm_memory.c +index e9a9136..b6cb23a 100644 +--- a/src/rocm_memory.c ++++ b/src/rocm_memory.c +@@ -44,8 +44,8 @@ static int init_rocm(int device_id) { + + hipDeviceProp_t prop = {0}; + ROCM_CHECK(hipGetDeviceProperties(&prop, device_id)); +- printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %d\n", +- device_id, prop.name, prop.pciBusID, prop.gcnArch); ++ printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %s\n", ++ device_id, prop.name, prop.pciBusID, prop.gcnArchName); + + return SUCCESS; + } From 692020e80b9681b62c7677e02bfa23e3fe89c1f7 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 25 Dec 2023 21:10:56 +0800 Subject: [PATCH 10/14] bugfix --- dockerfile/rocm5.7.x.dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile index ce87e9fc6..b38992ee2 100644 --- a/dockerfile/rocm5.7.x.dockerfile +++ b/dockerfile/rocm5.7.x.dockerfile @@ -166,11 +166,13 @@ RUN apt install rocm-cmake -y && \ WORKDIR ${SB_HOME} ADD third_party third_party 
-RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release-staging/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +# Apply patch +RUN cd third_party/perftest && \ + git apply ../perftest_rocm6.patch +RUN make ROCM_PATH=/opt/rocm-5.7.0 RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm ADD . . #ENV USE_HIPBLASLT_DATATYPE=1 -ENV CXX=/opt/rocm/bin/hipcc RUN python3 -m pip install .[amdworker] && \ - make cppbuild && \ + CXX=/opt/rocm/bin/hipcc make cppbuild && \ make postinstall From d5cf4e08e2919079a8c8a4ecfe6fbf2257a9ddaf Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 25 Dec 2023 21:37:20 +0800 Subject: [PATCH 11/14] update --- dockerfile/rocm6.0.x.dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index 4f0a28f9d..18bcf4ade 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -168,9 +168,10 @@ ADD third_party third_party # Apply patch RUN cd third_party/perftest && \ git apply ../perftest_rocm6.patch +RUN make ROCM_PATH=/opt/rocm-6.0.0 RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=develop ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o megatron_deepspeed RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch -RUN make ROCM_PATH=/opt/rocm-6.0.0 RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=develop ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +RUN make -C third_party megatron_deepspeed ADD . . 
ENV USE_HIPBLASLT_DATATYPE=1 From fd87f1a9fe6aa00432f80b73bc9da466566ef0a4 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 25 Dec 2023 21:52:28 +0800 Subject: [PATCH 12/14] update mpi build to replace the original static mpi build in base image --- dockerfile/rocm5.7.x.dockerfile | 11 ++++------- dockerfile/rocm6.0.x.dockerfile | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile index b38992ee2..727e0db26 100644 --- a/dockerfile/rocm5.7.x.dockerfile +++ b/dockerfile/rocm5.7.x.dockerfile @@ -110,21 +110,18 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10 # Install OpenMPI ENV OPENMPI_VERSION=4.1.x # Check if Open MPI is installed -RUN [ -d /usr/local/bin/mpirun ] || { \ - echo "Open MPI not found. Installing Open MPI..." && \ - cd /tmp && \ +RUN cd /tmp && \ git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ cd ompi && \ ./autogen.pl && \ mkdir build && \ cd build && \ - ../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ + ../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \ make -j $(nproc) && \ make -j $(nproc) install && \ ldconfig && \ cd / && \ - rm -rf /tmp/openmpi-${OPENMPI_VERSION}* ;\ - } + rm -rf /tmp/openmpi-${OPENMPI_VERSION}* # Install Intel MLC RUN cd /tmp && \ @@ -169,7 +166,7 @@ ADD third_party third_party # Apply patch RUN cd third_party/perftest && \ git apply ../perftest_rocm6.patch -RUN make ROCM_PATH=/opt/rocm-5.7.0 RCCL_HOME=/opt/rccl/build/ MPI_HOME=/usr/local ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +RUN make ROCM_PATH=/opt/rocm-5.7.0 
RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm ADD . . #ENV USE_HIPBLASLT_DATATYPE=1 diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index 18bcf4ade..6ccd37c84 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -168,7 +168,7 @@ ADD third_party third_party # Apply patch RUN cd third_party/perftest && \ git apply ../perftest_rocm6.patch -RUN make ROCM_PATH=/opt/rocm-6.0.0 RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=develop ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o megatron_deepspeed +RUN make ROCM_PATH=/opt/rocm-6.0.0 RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=develop ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o megatron_deepspeed RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch RUN make -C third_party megatron_deepspeed From 61485915a1a609b4cc45490383847123f8dc0842 Mon Sep 17 00:00:00 2001 From: Andy li Date: Tue, 26 Dec 2023 14:18:36 +0000 Subject: [PATCH 13/14] bugfix --- third_party/Makefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/third_party/Makefile b/third_party/Makefile index 3b362ed5d..a98876839 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -12,6 +12,7 @@ ROCM_PATH ?= /opt/rocm CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2) ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' 
-f1-3) +ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0") .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm @@ -114,10 +115,12 @@ rocm_rocblas: sb_micro_path rocm_hipblaslt: sb_micro_path @if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \ if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \ + if [ "$$(expr $(ROCM_VER) \>= 6.0)" -eq 1 ]; then \ rm -rf ${ROCM_PATH}/lib/hipblaslt; \ - rm -rf ${ROCM_PATH}/lib/cmake/hipblaslt; \ - rm -rf ${ROCM_PATH}/include/hipblaslt; \ - rm -rf ${ROCM_PATH}/share/doc/hipblaslt; \ + rm -rf ${ROCM_PATH}/lib/cmake/hipblaslt; \ + rm -rf ${ROCM_PATH}/include/hipblaslt; \ + rm -rf ${ROCM_PATH}/share/doc/hipblaslt; \ + fi; \ git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \ cd ./hipBLASLt && ./install.sh -dc; \ cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \ From e6052dab2b515ee8a7db8c27e6b2eb04e3d870f4 Mon Sep 17 00:00:00 2001 From: yukirora Date: Wed, 27 Dec 2023 06:53:06 +0000 Subject: [PATCH 14/14] restore hipblaslt --- dockerfile/rocm5.7.x.dockerfile | 6 +++--- dockerfile/rocm6.0.x.dockerfile | 10 +++++----- third_party/Makefile | 6 ------ 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile index 727e0db26..6f2ede3a7 100644 --- a/dockerfile/rocm5.7.x.dockerfile +++ b/dockerfile/rocm5.7.x.dockerfile @@ -137,8 +137,8 @@ RUN cd /opt/ && \ mkdir build && \ cd build && \ CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ - 
-DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ - .. && \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. && \ make -j${NUM_MAKE_JOBS} # Install AMD SMI Python Library @@ -166,7 +166,7 @@ ADD third_party third_party # Apply patch RUN cd third_party/perftest && \ git apply ../perftest_rocm6.patch -RUN make ROCM_PATH=/opt/rocm-5.7.0 RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm +RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-5.7.1.1 HIPBLASLT_BRANCH=release/rocm-rel-5.7 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm ADD . . #ENV USE_HIPBLASLT_DATATYPE=1 diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile index 6ccd37c84..55f0d2b52 100644 --- a/dockerfile/rocm6.0.x.dockerfile +++ b/dockerfile/rocm6.0.x.dockerfile @@ -143,8 +143,8 @@ RUN cd /opt/ && \ mkdir build && \ cd build && \ CXX=/opt/rocm/bin/hipcc cmake -DHIP_COMPILER=clang -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE=1 \ - -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ - .. && \ + -DCMAKE_PREFIX_PATH="${ROCM_PATH}/hsa;${ROCM_PATH}/hip;${ROCM_PATH}/share/rocm/cmake/;${ROCM_PATH}" \ + .. 
&& \ make -j${NUM_MAKE_JOBS} ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ @@ -168,13 +168,13 @@ ADD third_party third_party # Apply patch RUN cd third_party/perftest && \ git apply ../perftest_rocm6.patch -RUN make ROCM_PATH=/opt/rocm-6.0.0 RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=develop ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o megatron_deepspeed +RUN make RCCL_HOME=/opt/rccl/build/ ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm RUN cd third_party/Megatron/Megatron-DeepSpeed && \ git apply ../megatron_deepspeed_rocm6.patch -RUN make -C third_party megatron_deepspeed ADD . . -ENV USE_HIPBLASLT_DATATYPE=1 +ENV USE_HIP_DATATYPE=1 +ENV USE_HIPBLAS_COMPUTETYPE=1 RUN python3 -m pip install .[amdworker] && \ CXX=/opt/rocm/bin/hipcc make cppbuild && \ make postinstall diff --git a/third_party/Makefile b/third_party/Makefile index a98876839..1b9658548 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -115,12 +115,6 @@ rocm_rocblas: sb_micro_path rocm_hipblaslt: sb_micro_path @if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \ if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \ - if [ "$$(expr $(ROCM_VER) \>= 6.0)" -eq 1 ]; then \ - rm -rf ${ROCM_PATH}/lib/hipblaslt; \ - rm -rf ${ROCM_PATH}/lib/cmake/hipblaslt; \ - rm -rf ${ROCM_PATH}/include/hipblaslt; \ - rm -rf ${ROCM_PATH}/share/doc/hipblaslt; \ - fi; \ git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \ cd ./hipBLASLt && ./install.sh -dc; \ cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/; \