Skip to content

Commit

Permalink
update lib version and bugfix
Browse files Browse the repository at this point in the history
  • Loading branch information
yukirora authored Dec 22, 2023
1 parent 55ea25c commit 2858d84
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 11 deletions.
29 changes: 18 additions & 11 deletions dockerfile/rocm6.0.x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@ FROM ${BASE_IMAGE}
# - Docker Client: 20.10.8
# ROCm:
# - ROCm: 6.0
# Pytorch:
# Lib:
# - torch: 2.0.1
# - rccl: 2.18.3+hip6.0 develop:7e1cbb4
# - hipblaslt: rocm-6.0.0(tag)
# - openmpi: 4.1.x
# - apex: 1.0.0
# Intel:
# - mlc: v3.10

Expand Down Expand Up @@ -107,26 +111,23 @@ RUN if ! command -v ofed_info >/dev/null 2>&1; then \

# Add target file to help determine which device(s) to build for
ENV ROCM_PATH=/opt/rocm
RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx1030\ngfx1100\ngfx1101\ngfx1102\n" >> ${ROCM_PATH}/bin/target.lst'
# Append the GPU target IDs to ${ROCM_PATH}/bin/target.lst.
# The target feature is spelled "xnack" (":xnack+" / ":xnack-"); "xnac+" is not a valid target ID.
RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnack+\ngfx940\ngfx941\ngfx942:sramecc+:xnack-\n" >> ${ROCM_PATH}/bin/target.lst'

# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
# Check if Open MPI is installed
RUN if [ -z "$(command -v mpirun)" ]; then \
echo "Open MPI not found. Installing Open MPI..." && \
cd /tmp && \
RUN cd /tmp && \
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
cd ompi && \
./autogen.pl && \
mkdir build && \
cd build && \
../configure --prefix=/usr/local --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
../configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default --enable-mpirun-prefix-by-default --enable-prte-prefix-by-default --with-rocm=/opt/rocm && \
make -j $(nproc) && \
make -j $(nproc) install && \
ldconfig && \
cd / && \
rm -rf /tmp/openmpi-${OPENMPI_VERSION}* ;\
fi
# git clone above creates /tmp/ompi (no /tmp/openmpi-* path exists), so remove that tree in this same layer.
rm -rf /tmp/ompi

# Install Intel MLC
RUN cd /tmp && \
Expand All @@ -146,6 +147,12 @@ RUN cd /opt/ && \
.. && \
make -j${NUM_MAKE_JOBS}

# Apply patch
RUN cd third_party/perftest && \
git apply ../perftest_rocm6.patch
RUN cd third_party/Megatron/Megatron-DeepSpeed && \
git apply ../megatron_deepspeed_rocm6.patch

ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
Expand All @@ -164,11 +171,11 @@ RUN apt install rocm-cmake -y && \
WORKDIR ${SB_HOME}

# Plain local directory copy: COPY is the explicit instruction for this (no archive extraction or URL fetch is needed).
COPY third_party third_party
RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm
RUN make RCCL_HOME=/opt/rccl/build/ MPI_HOME=$MPI_HOME ROCBLAS_BRANCH=release/rocm-rel-6.0 HIPBLASLT_BRANCH=rocm-6.0.0 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm

# Copy the build context into the current WORKDIR; COPY is preferred over ADD for plain local files.
COPY . .
ENV USE_HIP_DATATYPE=1
ENV CXX=/opt/rocm/bin/hipcc
ENV USE_HIPBLAS_COMPUTETYPE=1
RUN python3 -m pip install .[amdworker] && \
make cppbuild && \
CXX=/opt/rocm/bin/hipcc make cppbuild && \
make postinstall
39 changes: 39 additions & 0 deletions third_party/Megatron/megatron_deepspeed_rocm6.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
index 76086de..1533648 100644
--- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
index 90e1c9f..d217aec 100644
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
35 changes: 35 additions & 0 deletions third_party/perftest_rocm6.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
diff --git a/configure.ac b/configure.ac
index 20eceda..5b5c5ab 100755
--- a/configure.ac
+++ b/configure.ac
@@ -237,13 +237,13 @@ AC_ARG_WITH([rocm],
],
[AS_CASE([$with_rocm],
[yes|no], [],
- [CPPFLAGS="-I$with_rocm/include $CPPFLAGS"
+ [CPPFLAGS="-I$with_rocm/include -D__HIP_PLATFORM_AMD__=1 $CPPFLAGS"
LDFLAGS="-L$with_rocm/lib64 -Wl,-rpath=$with_rocm/lib64 -L$with_rocm/lib -Wl,-rpath=$with_rocm/lib -lamdhip64 $LDFLAGS"])
])

AS_IF([test "x$enable_rocm" = xyes], [
AC_DEFINE([__HIP_PLATFORM_HCC__], [1], [Enable ROCm])
- AC_CHECK_HEADERS([hip/hip_runtime_api.h], [],
+ AC_CHECK_HEADERS([/opt/rocm/include/hip/hip_runtime_api.h], [],
[AC_MSG_ERROR([cannot include hip/hip_runtime_api.h])])
AC_SEARCH_LIBS([hipFree], [amdhip64], [],
[AC_MSG_ERROR([cannot link with -lamdhip64])])
diff --git a/src/rocm_memory.c b/src/rocm_memory.c
index e9a9136..cc028c9 100644
--- a/src/rocm_memory.c
+++ b/src/rocm_memory.c
@@ -44,8 +44,8 @@ static int init_rocm(int device_id) {

hipDeviceProp_t prop = {0};
ROCM_CHECK(hipGetDeviceProperties(&prop, device_id));
- printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %d\n",
- device_id, prop.name, prop.pciBusID, prop.gcnArch);
+ printf("Using ROCm Device with ID: %d, Name: %s, PCI Bus ID: 0x%x, GCN Arch: %s\n",
+ device_id, prop.name, prop.pciBusID, prop.gcnArchName);

return SUCCESS;
}

0 comments on commit 2858d84

Please sign in to comment.