# Patch: build gdrcopy from a pinned git commit (2.5-1), add the HCOLL library
# directory to the OpenMPI modulefile LD_LIBRARY_PATH, bump the NVIDIA driver /
# fabric manager to 560.35.03, and add a CUDA runtime-vs-driver version check.
#
# Review changes relative to the patch as originally submitted:
#   * tests/test-definitions.sh: ver() now emits a plain decimal integer.
#     The zero-padded form (e.g. 012008000) is evaluated as octal by
#     [[ ... -gt ... ]], which errors out on the digits 8/9 and makes the
#     compatibility check fall through to the "[OK]" branch.
#   * tests/test-definitions.sh: nvidia-smi output is captured before it is
#     parsed, so check_exit_code follows nvidia-smi itself instead of the last
#     command of a tail/awk/tr pipeline.
#   * install_gdrcopy.sh (alma, ubuntu): ${GDRCOPY_COMMIT} is quoted.
#
# NOTE(review): this text was recovered from a whitespace-collapsed copy of the
# patch. Indentation of context lines and the position of blank context lines
# were reconstructed from the hunk line counts - confirm against the target
# files, or apply with whitespace-insensitive matching
# (git apply --ignore-whitespace / patch -l).
diff --git a/alma/common/install_gdrcopy.sh b/alma/common/install_gdrcopy.sh
index 6cc2a15a..29609b37 100755
--- a/alma/common/install_gdrcopy.sh
+++ b/alma/common/install_gdrcopy.sh
@@ -6,20 +6,17 @@ source ${COMMON_DIR}/utilities.sh
 # Install gdrcopy
 gdrcopy_metadata=$(get_component_config "gdrcopy")
 GDRCOPY_VERSION=$(jq -r '.version' <<< $gdrcopy_metadata)
-GDRCOPY_SHA256=$(jq -r '.sha256' <<< $gdrcopy_metadata)
+GDRCOPY_COMMIT=$(jq -r '.commit' <<< $gdrcopy_metadata)
 GDRCOPY_DISTRIBUTION=$(jq -r '.distribution' <<< $gdrcopy_metadata)
 
-TARBALL="v${GDRCOPY_VERSION}.tar.gz"
-GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL}
+git clone https://github.com/NVIDIA/gdrcopy.git
+pushd gdrcopy/packages/
+git checkout "${GDRCOPY_COMMIT}"
 
-${COMMON_DIR}/download_and_verify.sh $GDRCOPY_DOWNLOAD_URL $GDRCOPY_SHA256
-tar -xvf $TARBALL
-
-pushd gdrcopy-${GDRCOPY_VERSION}/packages/
 CUDA=/usr/local/cuda ./build-rpm-packages.sh
-rpm -Uvh gdrcopy-kmod-${GDRCOPY_VERSION}-1dkms.${GDRCOPY_DISTRIBUTION}.noarch.rpm
-rpm -Uvh gdrcopy-${GDRCOPY_VERSION}-1.${GDRCOPY_DISTRIBUTION}.x86_64.rpm
-rpm -Uvh gdrcopy-devel-${GDRCOPY_VERSION}-1.${GDRCOPY_DISTRIBUTION}.noarch.rpm
+rpm -Uvh gdrcopy-kmod-${GDRCOPY_VERSION}dkms.${GDRCOPY_DISTRIBUTION}.noarch.rpm
+rpm -Uvh gdrcopy-${GDRCOPY_VERSION}.${GDRCOPY_DISTRIBUTION}.x86_64.rpm
+rpm -Uvh gdrcopy-devel-${GDRCOPY_VERSION}.${GDRCOPY_DISTRIBUTION}.noarch.rpm
 sed -i "$ s/$/ gdrcopy*/" /etc/dnf/dnf.conf
 popd
 
diff --git a/alma/common/install_mpis.sh b/alma/common/install_mpis.sh
index 5e7b6aad..3d9f848d 100755
--- a/alma/common/install_mpis.sh
+++ b/alma/common/install_mpis.sh
@@ -142,7 +142,7 @@ cat << EOF >> ${MPI_MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION}
 conflict mpi
 module load ${GCC_VERSION}
 prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin
-prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib
+prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib:${HCOLL_PATH}/lib
 prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man
 setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin
 setenv MPI_INCLUDE /opt/openmpi-${OMPI_VERSION}/include
diff --git a/tests/test-definitions.sh b/tests/test-definitions.sh
index 328ea6f0..7e240d7f 100755
--- a/tests/test-definitions.sh
+++ b/tests/test-definitions.sh
@@ -25,6 +25,17 @@ function check_exit_code {
     fi
 }
 
+# Convert a dotted version string (e.g. "12.6" or "12.6.1") into one decimal
+# integer, major*1000000 + minor*1000 + patch, suitable for -gt/-lt tests.
+# Missing components count as 0; components after the third are ignored.
+# Each component is forced to base 10 and the result has no leading zero, so
+# neither "08"/"09" components nor the result itself are misread as octal.
+function ver {
+    local major minor patch
+    IFS='.' read -r major minor patch _ <<< "$1"
+    printf '%d\n' "$(( 10#${major:-0} * 1000000 + 10#${minor:-0} * 1000 + 10#${patch:-0} ))"
+}
+
 # verify OFED installation
 function verify_ofed_installation {
     # verify OFED installation
@@ -99,8 +110,12 @@ function verify_ompi_installation {
 
 function verify_cuda_installation {
     # Verify NVIDIA Driver installation
-    nvidia-smi
-    check_exit_code "Nvidia Driver ${VERSION_NVIDIA}" "Failed to run Nvidia SMI"
+    # Capture the raw output first: check_exit_code runs right after this
+    # assignment, so it follows nvidia-smi itself, not a parsing pipeline.
+    nvidia_smi_version_output=$(nvidia-smi --version)
+    check_exit_code "NVIDIA Driver ${VERSION_NVIDIA}" "Failed to run NVIDIA SMI"
+    # Take the text after ':' on the last output line as the driver's CUDA version.
+    nvidia_driver_cuda_version=$(tail -n 1 <<< "${nvidia_smi_version_output}" | awk -F':' '{print $2}' | tr -d "[:space:]")
 
     # Verify if NVIDIA peer memory module is inserted
     lsmod | grep nvidia_peermem
@@ -112,6 +127,15 @@ function verify_cuda_installation {
     # check_exit_code "CUDA Driver ${VERSION_CUDA}" "CUDA not installed"
     check_exists "/usr/local/cuda/"
 
+    # Check that the CUDA runtime version isn't newer than the driver CUDA version.
+    # Having a newer CUDA runtime breaks gpu-burn
+    if [[ $(ver "${VERSION_CUDA}") -gt $(ver "${nvidia_driver_cuda_version}") ]]; then
+        echo "*** Error - CUDA runtime version ${VERSION_CUDA} is newer than the driver CUDA version ${nvidia_driver_cuda_version}"
+        exit -1
+    else
+        echo "[OK] : CUDA runtime version ${VERSION_CUDA} is compatible with the driver CUDA version ${nvidia_driver_cuda_version}"
+    fi
+
     # Verify the compilation of CUDA samples
     /usr/local/cuda/samples/0_Introduction/mergeSort/mergeSort
     check_exit_code "CUDA Samples ${VERSION_CUDA}" "Failed to perform merge sort using CUDA Samples"
diff --git a/ubuntu/common/install_gdrcopy.sh b/ubuntu/common/install_gdrcopy.sh
index 4964ee72..2718f45b 100755
--- a/ubuntu/common/install_gdrcopy.sh
+++ b/ubuntu/common/install_gdrcopy.sh
@@ -8,19 +8,16 @@ apt install -y build-essential devscripts debhelper check libsubunit-dev fakeroo
 
 gdrcopy_metadata=$(get_component_config "gdrcopy")
 GDRCOPY_VERSION=$(jq -r '.version' <<< $gdrcopy_metadata)
-GDRCOPY_SHA256=$(jq -r '.sha256' <<< $gdrcopy_metadata)
+GDRCOPY_COMMIT=$(jq -r '.commit' <<< $gdrcopy_metadata)
 GDRCOPY_DISTRIBUTION=$(jq -r '.distribution' <<< $gdrcopy_metadata)
 
 cuda_metadata=$(get_component_config "cuda")
 CUDA_DRIVER_VERSION=$(jq -r '.driver.version' <<< $cuda_metadata)
 
-TARBALL="v${GDRCOPY_VERSION}.tar.gz"
-GDRCOPY_DOWNLOAD_URL=https://github.com/NVIDIA/gdrcopy/archive/refs/tags/${TARBALL}
+git clone https://github.com/NVIDIA/gdrcopy.git
+pushd gdrcopy/packages/
+git checkout "${GDRCOPY_COMMIT}"
 
-${COMMON_DIR}/download_and_verify.sh $GDRCOPY_DOWNLOAD_URL $GDRCOPY_SHA256
-tar -xvf $TARBALL
-
-pushd gdrcopy-${GDRCOPY_VERSION}/packages/
 CUDA=/usr/local/cuda ./build-deb-packages.sh
 dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}_amd64.${GDRCOPY_DISTRIBUTION}.deb
 apt-mark hold gdrdrv-dkms
diff --git a/ubuntu/common/install_mpis.sh b/ubuntu/common/install_mpis.sh
index 47888b1f..a846bec1 100755
--- a/ubuntu/common/install_mpis.sh
+++ b/ubuntu/common/install_mpis.sh
@@ -133,7 +133,7 @@ cat << EOF >> ${MPI_MODULE_FILES_DIRECTORY}/openmpi-${OMPI_VERSION}
 #
 conflict mpi
 prepend-path PATH /opt/openmpi-${OMPI_VERSION}/bin
-prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib
+prepend-path LD_LIBRARY_PATH /opt/openmpi-${OMPI_VERSION}/lib:${HCOLL_PATH}/lib
 prepend-path MANPATH /opt/openmpi-${OMPI_VERSION}/share/man
 setenv MPI_BIN /opt/openmpi-${OMPI_VERSION}/bin
 setenv MPI_INCLUDE /opt/openmpi-${OMPI_VERSION}/include
diff --git a/versions.json b/versions.json
index 6a79b123..9b1a978a 100644
--- a/versions.json
+++ b/versions.json
@@ -73,46 +73,46 @@
     "nvidia": {
         "ubuntu20.04": {
             "driver": {
-                "version": "550.90.07",
-                "sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
+                "version": "560.35.03",
+                "sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
             },
             "fabricmanager": {
                 "distribution": "ubuntu2004",
-                "version": "550.90.07-1",
-                "sha256": "f8321f47875fa79968cda38c77c1723ff9fa3ca010f250f17b0075d84f443058"
+                "version": "560.35.03-1",
+                "sha256": "ad68065a83e2d3a5b3c3b8121fdc146c2aea5cdcca77b1ab4f9a64ac932be966"
             }
         },
         "ubuntu22.04": {
             "driver": {
-                "version": "550.90.07",
-                "sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
+                "version": "560.35.03",
+                "sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
             },
             "fabricmanager": {
                 "distribution": "ubuntu2204",
-                "version": "550.90.07-1",
-                "sha256": "f8321f47875fa79968cda38c77c1723ff9fa3ca010f250f17b0075d84f443058"
+                "version": "560.35.03-1",
+                "sha256": "ad68065a83e2d3a5b3c3b8121fdc146c2aea5cdcca77b1ab4f9a64ac932be966"
             }
         },
         "almalinux8.7": {
             "driver": {
-                "version": "550.90.07",
-                "sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
+                "version": "560.35.03",
+                "sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
             },
             "fabricmanager": {
                 "distribution": "rhel8",
-                "version": "550.90.07-1",
-                "sha256": "e580c8b412de7a9f352c2eacce877b4cbccd494bfdee7a0e6c94010a06c91c92"
+                "version": "560.35.03-1",
+                "sha256": "881a686f3214fbad464404f6c71dace656cdca841e3abdfd1b04a8e310e797ad"
             }
         },
         "almalinux8.10": {
             "driver": {
-                "version": "550.90.07",
-                "sha256": "51acf579d5a9884f573a1d3f522e7fafa5e7841e22a9cec0b4bbeae31b0b9733"
+                "version": "560.35.03",
+                "sha256": "f2932c92fadd43c5b2341be453fc4f73f0ad7185c26bb7a43fbde81ae29f1fe3"
             },
             "fabricmanager": {
                 "distribution": "rhel8",
-                "version": "550.90.07-1",
-                "sha256": "e580c8b412de7a9f352c2eacce877b4cbccd494bfdee7a0e6c94010a06c91c92"
+                "version": "560.35.03-1",
+                "sha256": "881a686f3214fbad464404f6c71dace656cdca841e3abdfd1b04a8e310e797ad"
             }
         }
     },
@@ -160,23 +160,23 @@
     },
     "gdrcopy": {
         "ubuntu20.04": {
-            "version": "2.4.1",
-            "sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
+            "version": "2.5-1",
+            "commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
             "distribution": "Ubuntu20_04"
         },
         "ubuntu22.04": {
-            "version": "2.4.1",
-            "sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
+            "version": "2.5-1",
+            "commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
             "distribution": "Ubuntu22_04"
         },
         "almalinux8.7": {
-            "version": "2.4.1",
-            "sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
+            "version": "2.5-1",
+            "commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
             "distribution": "el8"
         },
         "almalinux8.10": {
-            "version": "2.4.1",
-            "sha256": "faa7e816e9bad3301e53d6721457f7ef5ab42b7aa3b01ffda51f8e5620bb20ed",
+            "version": "2.5-1",
+            "commit": "1366e20d140c5638fcaa6c72b373ac69f7ab2532",
             "distribution": "el8"
         }
     },