Skip to content

Commit

Permalink
Bug fixes (#387)
Browse files Browse the repository at this point in the history
- #343 
    - Added a test to confirm the fix
    - Pull GDRCopy from master for bug fix to be compatible with newer NVIDIA driver versions
- OpenMPI module file was missing the hcoll lib path in LD_LIBRARY_PATH
  • Loading branch information
LiquidPT authored Oct 18, 2024
1 parent 6c3e6bb commit 8d63e5a
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 45 deletions.
17 changes: 7 additions & 10 deletions alma/common/install_gdrcopy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,17 @@ source ${COMMON_DIR}/utilities.sh
# Install gdrcopy
gdrcopy_metadata=$(get_component_config "gdrcopy")
GDRCOPY_VERSION=$(jq -r '.version' <<< $gdrcopy_metadata)
GDRCOPY_SHA256=$(jq -r '.sha256' <<< $gdrcopy_metadata)
GDRCOPY_COMMIT=$(jq -r '.commit' <<< $gdrcopy_metadata)
GDRCOPY_DISTRIBUTION=$(jq -r '.distribution' <<< $gdrcopy_metadata)

TARBALL="v${GDRCOPY_VERSION}.tar.gz"
GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL}
git clone https://github.com/NVIDIA/gdrcopy.git
pushd gdrcopy/packages/
git checkout ${GDRCOPY_COMMIT}

${COMMON_DIR}/download_and_verify.sh $GDRCOPY_DOWNLOAD_URL $GDRCOPY_SHA256
tar -xvf $TARBALL

pushd gdrcopy-${GDRCOPY_VERSION}/packages/
CUDA=/usr/local/cuda ./build-rpm-packages.sh
rpm -Uvh gdrcopy-kmod-${GDRCOPY_VERSION}-1dkms.${GDRCOPY_DISTRIBUTION}.noarch.rpm
rpm -Uvh gdrcopy-${GDRCOPY_VERSION}-1.${GDRCOPY_DISTRIBUTION}.x86_64.rpm
rpm -Uvh gdrcopy-devel-${GDRCOPY_VERSION}-1.${GDRCOPY_DISTRIBUTION}.noarch.rpm
rpm -Uvh gdrcopy-kmod-${GDRCOPY_VERSION}dkms.${GDRCOPY_DISTRIBUTION}.noarch.rpm
rpm -Uvh gdrcopy-${GDRCOPY_VERSION}.${GDRCOPY_DISTRIBUTION}.x86_64.rpm
rpm -Uvh gdrcopy-devel-${GDRCOPY_VERSION}.${GDRCOPY_DISTRIBUTION}.noarch.rpm
sed -i "$ s/$/ gdrcopy*/" /etc/dnf/dnf.conf
popd

Expand Down
2 changes: 1 addition & 1 deletion alma/common/install_mpis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ cat << EOF >> ${MPI_MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION}
conflict mpi
module load ${GCC_VERSION}
prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin
prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib
prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib:${HCOLL_PATH}/lib
prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man
setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin
setenv MPI_INCLUDE /opt/openmpi-${OMPI_VERSION}/include
Expand Down
17 changes: 15 additions & 2 deletions tests/test-definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ function check_exit_code {
fi
}

# Convert a dotted version string into a single comparable decimal integer.
#
# Arguments: $1 - version such as "12.6", "12.6.1" or "550.127.08"
# Outputs:   major*1000000 + minor*1000 + patch on stdout (e.g. "12.6" -> 12006000)
#
# Only the first three components are used; missing ones count as 0 and any
# non-digit suffix on a component (e.g. the "-1" in "2.5-1") is ignored.
# The result is deliberately NOT zero-padded: callers compare it with
# [[ ... -gt ... ]], which evaluates operands arithmetically, so a leading
# zero would be parsed as octal and any 8 or 9 digit would raise
# "value too great for base".
function ver {
    local version major minor patch _extra

    # Drop stray whitespace so values scraped from command output parse cleanly.
    version=${1//[[:space:]]/}
    IFS='.' read -r major minor patch _extra <<< "${version}"

    # Keep only the leading digits of each component.
    major=${major%%[!0-9]*}
    minor=${minor%%[!0-9]*}
    patch=${patch%%[!0-9]*}

    # 10# forces base 10 so components like "08" are not rejected as octal.
    printf '%d' "$(( 10#${major:-0} * 1000000 + 10#${minor:-0} * 1000 + 10#${patch:-0} ))"
}

# verify OFED installation
function verify_ofed_installation {
# verify OFED installation
Expand Down Expand Up @@ -99,8 +103,8 @@ function verify_ompi_installation {

function verify_cuda_installation {
# Verify NVIDIA Driver installation
nvidia-smi
check_exit_code "Nvidia Driver ${VERSION_NVIDIA}" "Failed to run Nvidia SMI"
nvidia_driver_cuda_version=$(nvidia-smi --version | tail -n 1 | awk -F':' '{print $2}' | tr -d "[:space:]")
check_exit_code "NVIDIA Driver ${VERSION_NVIDIA}" "Failed to run NVIDIA SMI"

# Verify if NVIDIA peer memory module is inserted
lsmod | grep nvidia_peermem
Expand All @@ -112,6 +116,15 @@ function verify_cuda_installation {
# check_exit_code "CUDA Driver ${VERSION_CUDA}" "CUDA not installed"
check_exists "/usr/local/cuda/"

# Check that the CUDA runtime version isn't newer than the driver CUDA version.
# Having a newer CUDA runtime breaks gpu-burn
if [[ $(ver ${VERSION_CUDA}) -gt $(ver ${nvidia_driver_cuda_version}) ]]; then
echo "*** Error - CUDA runtime version ${VERSION_CUDA} is newer than the driver CUDA version ${nvidia_driver_cuda_version}"
exit -1
else
echo "[OK] : CUDA runtime version ${VERSION_CUDA} is compatible with the driver CUDA version ${nvidia_driver_cuda_version}"
fi

# Verify the compilation of CUDA samples
/usr/local/cuda/samples/0_Introduction/mergeSort/mergeSort
check_exit_code "CUDA Samples ${VERSION_CUDA}" "Failed to perform merge sort using CUDA Samples"
Expand Down
11 changes: 4 additions & 7 deletions ubuntu/common/install_gdrcopy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,16 @@ apt install -y build-essential devscripts debhelper check libsubunit-dev fakeroo

gdrcopy_metadata=$(get_component_config "gdrcopy")
GDRCOPY_VERSION=$(jq -r '.version' <<< $gdrcopy_metadata)
GDRCOPY_SHA256=$(jq -r '.sha256' <<< $gdrcopy_metadata)
GDRCOPY_COMMIT=$(jq -r '.commit' <<< $gdrcopy_metadata)
GDRCOPY_DISTRIBUTION=$(jq -r '.distribution' <<< $gdrcopy_metadata)

cuda_metadata=$(get_component_config "cuda")
CUDA_DRIVER_VERSION=$(jq -r '.driver.version' <<< $cuda_metadata)

TARBALL="v${GDRCOPY_VERSION}.tar.gz"
GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL}
git clone https://github.com/NVIDIA/gdrcopy.git
pushd gdrcopy/packages/
git checkout ${GDRCOPY_COMMIT}

${COMMON_DIR}/download_and_verify.sh $GDRCOPY_DOWNLOAD_URL $GDRCOPY_SHA256
tar -xvf $TARBALL

pushd gdrcopy-${GDRCOPY_VERSION}/packages/
CUDA=/usr/local/cuda ./build-deb-packages.sh
dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}_amd64.${GDRCOPY_DISTRIBUTION}.deb
apt-mark hold gdrdrv-dkms
Expand Down
2 changes: 1 addition & 1 deletion ubuntu/common/install_mpis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ cat << EOF >> ${MPI_MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION}
#
conflict mpi
prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin
prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib
prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib:${HCOLL_PATH}/lib
prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man
setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin
setenv MPI_INCLUDE /opt/openmpi-${OMPI_VERSION}/include
Expand Down
48 changes: 24 additions & 24 deletions versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,46 +73,46 @@
"nvidia": {
"ubuntu20.04": {
"driver": {
"version": "550.90.07",
"sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
"version": "560.35.03",
"sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
},
"fabricmanager": {
"distribution": "ubuntu2004",
"version": "550.90.07-1",
"sha256": "f8321f47875fa79968cda38c77c1723ff9fa3ca010f250f17b0075d84f443058"
"version": "560.35.03-1",
"sha256": "ad68065a83e2d3a5b3c3b8121fdc146c2aea5cdcca77b1ab4f9a64ac932be966"
}
},
"ubuntu22.04": {
"driver": {
"version": "550.90.07",
"sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
"version": "560.35.03",
"sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
},
"fabricmanager": {
"distribution": "ubuntu2204",
"version": "550.90.07-1",
"sha256": "f8321f47875fa79968cda38c77c1723ff9fa3ca010f250f17b0075d84f443058"
"version": "560.35.03-1",
"sha256": "ad68065a83e2d3a5b3c3b8121fdc146c2aea5cdcca77b1ab4f9a64ac932be966"
}
},
"almalinux8.7": {
"driver": {
"version": "550.90.07",
"sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
"version": "560.35.03",
"sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
},
"fabricmanager": {
"distribution": "rhel8",
"version": "550.90.07-1",
"sha256": "e580c8b412de7a9f352c2eacce877b4cbccd494bfdee7a0e6c94010a06c91c92"
"version": "560.35.03-1",
"sha256": "881a686f3214fbad464404f6c71dace656cdca841e3abdfd1b04a8e310e797ad"
}
},
"almalinux8.10": {
"driver": {
"version": "550.90.07",
"sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
"version": "560.35.03",
"sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
},
"fabricmanager": {
"distribution": "rhel8",
"version": "550.90.07-1",
"sha256": "e580c8b412de7a9f352c2eacce877b4cbccd494bfdee7a0e6c94010a06c91c92"
"version": "560.35.03-1",
"sha256": "881a686f3214fbad464404f6c71dace656cdca841e3abdfd1b04a8e310e797ad"
}
}
},
Expand Down Expand Up @@ -160,23 +160,23 @@
},
"gdrcopy": {
"ubuntu20.04": {
"version": "2.4.1",
"sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
"version": "2.5-1",
"commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
"distribution": "Ubuntu20_04"
},
"ubuntu22.04": {
"version": "2.4.1",
"sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
"version": "2.5-1",
"commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
"distribution": "Ubuntu22_04"
},
"almalinux8.7": {
"version": "2.4.1",
"sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
"version": "2.5-1",
"commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
"distribution": "el8"
},
"almalinux8.10": {
"version": "2.4.1",
"sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
"version": "2.5-1",
"commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
"distribution": "el8"
}
},
Expand Down

0 comments on commit 8d63e5a

Please sign in to comment.