From fed257d6ceb5ab83ad4587dec7302e55fbbb077a Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 14 Feb 2024 12:55:53 -0800 Subject: [PATCH 001/169] Adding some flexibility in the customized_build_env script to make the location of the external superbuild dependencies easily relocatable. --- scripts/customize_build_env.sh | 27 +++++++----- scripts/superbuild_externals.sh | 77 +++++++++++++++++++++------------ 2 files changed, 66 insertions(+), 38 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index cc958770f3a..58d9df1d1b4 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -72,7 +72,9 @@ set_center_specific_gpu_arch() ;; "zen3") # Tioga, RZVernal # Use a HIP Clang variant - GPU_ARCH_VARIANTS="amdgpu_target=gfx90a" +# GPU_ARCH_VARIANTS="amdgpu_target=gfx90a" + GPU_ARCH_VARIANTS="amdgpu_target=gfx90a,gfx942" +# GPU_ARCH_VARIANTS="amdgpu_target=gfx940" ;; *) ;; @@ -124,7 +126,8 @@ set_center_specific_modules() # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3") # Tioga, RZVernal - MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.09.0 cce/17.0.0 craype/2.7.23 cray-mpich/8.1.28 cray-libsci/23.09.1.1 PrgEnv-cray StdEnv rocm/5.7.1 cmake/3.24.2" + MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.0 craype/2.7.30 cray-mpich/8.1.28 cray-libsci/23.12.5 PrgEnv-cray StdEnv rocm/5.7.1 cmake/3.24.2" +# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.09.0 cce/16.0.1 craype/2.7.23 cray-mpich/8.1.27 cray-libsci/23.09.1.1 PrgEnv-cray/8.4.0 StdEnv rocm/5.7.1 cmake/3.24.2" ;; *) echo "No pre-specified modules found for this system. 
Make sure to setup your own" @@ -299,6 +302,7 @@ set_center_specific_externals() local module_dir="$5" if [[ ${center} = "llnl_lc" ]]; then + prefix="/p/vast1/lbann/stable_dependencies" case ${spack_arch_target} in "broadwell" | "haswell" | "sandybridge" | "ivybridge") cat <> ${yaml} @@ -319,8 +323,8 @@ cat <> ${yaml} modules: - mvapich2/2.3.7 EOF - set_superbuild_externals "pascal" "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" - set_superbuild_DHA_externals "pascal" "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" + set_superbuild_externals "pascal" "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals "pascal" "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${prefix}" ;; "power9le" | "power8le") cat <> ${yaml} @@ -333,9 +337,9 @@ cat <> ${yaml} - spec: rdma-core@20 arch=${spack_arch} prefix: /usr EOF - set_superbuild_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" - set_superbuild_DHA_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" - set_superbuild_power_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" + set_superbuild_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" + set_superbuild_power_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" ;; "zen" | "zen2") @@ -373,11 +377,12 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - set_superbuild_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" - set_superbuild_DHA_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" + set_superbuild_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" ;; "zen3") + prefix="/usr/workspace/lbann/stable_dependencies" cat <> ${yaml} compilers: - compiler: @@ 
-462,8 +467,8 @@ cat <> ${yaml} modules: - cce/17.0.0 PrgEnv-cray cray-mpich/8.1.28 EOF - set_superbuild_externals "tioga" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" - set_superbuild_DHA_externals "tioga" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" + set_superbuild_externals "rzvernal" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" + set_superbuild_DHA_externals "rzvernal" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; *) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index dd48c635c9b..d592af5b3bd 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -5,8 +5,19 @@ set_superbuild_externals() local mpi="$3" local yaml="$4" local LOG="$5" + local prefix="$6" + local gpu_arch="$7" + + if [ -n "${gpu_arch}" ]; then + dnn_lib="${dnn_lib}/${gpu_arch}" + fi - CMD="source /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + #/usr/workspace/lbann/stable_dependencies/rzvernal/rocm-5.7.1/mi300a/cray-mpich-8.1.27 + + #/usr/workspace/lbann/lbann-superbuild/ + + CMD="source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh" +# CMD="source /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh" echo ${CMD} | tee -a ${LOG} ${CMD} @@ -17,123 +28,123 @@ cat <> ${yaml} - 'master' externals: - spec: adiak@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/adiak + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/adiak caliper: buildable: false version: - 'master' externals: - spec: caliper@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/caliper + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/caliper catch2: buildable: false version: - '2.9.2' externals: - spec: catch2@2.9.2 arch=${spack_arch} - prefix: 
/p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/catch2 + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/catch2 hdf5: buildable: false version: - '1.10.9' externals: - spec: hdf5@1.10.9 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/hdf5 + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hdf5 jpeg-turbo: buildable: false version: - '2.0.3' externals: - spec: jpeg-turbo@2.0.3 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/jpeg-turbo + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/jpeg-turbo spdlog: buildable: false version: - '1.12.0' externals: - spec: spdlog@1.12.0 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/spdlog + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/spdlog cereal: buildable: false version: - '1.3.0' externals: - spec: cereal@1.3.0 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/cereal + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/cereal clara: buildable: false version: - '1.1.5' externals: - spec: clara@1.1.5 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/clara + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/clara cnpy: buildable: false version: - 'master' externals: - spec: cnpy@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/cnpy + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/cnpy conduit: buildable: false version: - 'develop' externals: - spec: conduit@develop arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/conduit + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/conduit hiptt: buildable: false version: - 'master' externals: - spec: hiptt@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/hiptt + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hiptt opencv: buildable: false 
version: - '4.1.0' externals: - spec: opencv@4.1.0 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/opencv + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/opencv protobuf: buildable: false version: - '3.21.5' externals: - - spec: protobuf@3.21.5 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/protobuf + - spec: protobuf@3.21.5~shared arch=${spack_arch} + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/protobuf zstr: buildable: false version: - 'master' externals: - spec: zstr@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/zstr + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/zstr nccl: buildable: false version: - '2.19.4' externals: - spec: nccl@2.19.4 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/nccl + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/nccl cudnn: buildable: false version: - '8.9.4' externals: - spec: cudnn@8.9.4 arch=linux-rhel8-broadwell - prefix: /p/vast1/lbann/stable_dependencies/cudnn-8.9.4/cuda_11_x86_64 + prefix: ${prefix}/cudnn-8.9.4/cuda_11_x86_64 - spec: cudnn@8.9.4 arch=linux-rhel7-power9le - prefix: /p/vast1/lbann/stable_dependencies/cudnn-8.9.4/cuda_11_ppc64le + prefix: ${prefix}/cudnn-8.9.4/cuda_11_ppc64le cutensor: buildable: false version: - '1.7.0.1' externals: - spec: cutensor@1.7.0.1 arch=linux-rhel8-broadwell - prefix: /p/vast1/lbann/stable_dependencies/cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive + prefix: ${prefix}/cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive - spec: cutensor@1.7.0.1 arch=linux-rhel7-power9le - prefix: /p/vast1/lbann/stable_dependencies/cutensor-1.7.0.1/libcutensor-linux-ppc64le-1.7.0.1-archive + prefix: ${prefix}/cutensor-1.7.0.1/libcutensor-linux-ppc64le-1.7.0.1-archive EOF } @@ -144,8 +155,14 @@ set_superbuild_DHA_externals() local dnn_lib="$2" local mpi="$3" local yaml="$4" + local prefix="$5" + local 
gpu_arch="$6" - source /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh + if [ -n "${gpu_arch}" ]; then + dnn_lib="${dnn_lib}/${gpu_arch}" + fi + +# source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh cat <> ${yaml} aluminum: @@ -154,21 +171,21 @@ cat <> ${yaml} - 'master' externals: - spec: aluminum@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/aluminum + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/aluminum hydrogen: buildable: false version: - 'develop' externals: - spec: hydrogen@develop arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/hydrogen + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hydrogen dihydrogen: buildable: false version: - 'develop' externals: - spec: dihydrogen@develop arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/dihydrogen + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/dihydrogen EOF } @@ -178,8 +195,14 @@ set_superbuild_power_externals() local dnn_lib="$2" local mpi="$3" local yaml="$4" + local prefix="$5" + local gpu_arch="$6" + + if [ -n "${gpu_arch}" ]; then + dnn_lib="${dnn_lib}/${gpu_arch}" + fi - source /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh +# source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh cat <> ${yaml} openblas: @@ -188,6 +211,6 @@ cat <> ${yaml} - '0.3.6' externals: - spec: openblas@0.3.6 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/openblas + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/openblas EOF } From 81a664af545fd25bcea9b19b038c5fe980fc4293 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 15 Feb 2024 13:39:55 -0800 Subject: [PATCH 002/169] Adding code to explicitly get the hostname for the superbuild configuration. 
--- scripts/customize_build_env.sh | 25 ++++++++++++++----------- scripts/utilities.sh | 7 +++++++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 58d9df1d1b4..30034d514f4 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -303,6 +303,7 @@ set_center_specific_externals() if [[ ${center} = "llnl_lc" ]]; then prefix="/p/vast1/lbann/stable_dependencies" + host=$(host_basename) case ${spack_arch_target} in "broadwell" | "haswell" | "sandybridge" | "ivybridge") cat <> ${yaml} @@ -323,8 +324,8 @@ cat <> ${yaml} modules: - mvapich2/2.3.7 EOF - set_superbuild_externals "pascal" "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals "pascal" "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_externals ${host} "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${prefix}" ;; "power9le" | "power8le") cat <> ${yaml} @@ -337,9 +338,9 @@ cat <> ${yaml} - spec: rdma-core@20 arch=${spack_arch} prefix: /usr EOF - set_superbuild_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" - set_superbuild_power_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" + set_superbuild_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" + set_superbuild_power_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" ;; "zen" | "zen2") @@ -377,12 +378,15 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - set_superbuild_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" - 
set_superbuild_DHA_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" ;; "zen3") - prefix="/usr/workspace/lbann/stable_dependencies" + if [ ${host} == "rzvernal" ]; then + # Override the prefix path for this system + prefix="/usr/workspace/lbann/stable_dependencies" + fi cat <> ${yaml} compilers: - compiler: @@ -467,9 +471,8 @@ cat <> ${yaml} modules: - cce/17.0.0 PrgEnv-cray cray-mpich/8.1.28 EOF - set_superbuild_externals "rzvernal" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" - set_superbuild_DHA_externals "rzvernal" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" - + set_superbuild_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" + set_superbuild_DHA_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; *) echo "No center-specified externals." diff --git a/scripts/utilities.sh b/scripts/utilities.sh index 97445aeb541..344a9547267 100644 --- a/scripts/utilities.sh +++ b/scripts/utilities.sh @@ -17,6 +17,13 @@ osx_realpath() { [[ $1 = /* ]] && echo "$1" || echo "$PWD/${1#./}" } +host_basename() { + HOST=$(hostname) + HOST=${HOST//[[:digit:]]/} + HOST=${HOST//\-/} + echo ${HOST} +} + function normpath() { # Remove all /./ sequences. local path=${1//\/.\//\/} From 949bffc3692b615bbc0a84e0b572be52c1c987c2 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 9 Apr 2024 16:19:39 -0700 Subject: [PATCH 003/169] Updated to the latest ROCm versions. 
--- scripts/customize_build_env.sh | 72 ++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 30034d514f4..f175a908b3f 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -126,8 +126,8 @@ set_center_specific_modules() # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3") # Tioga, RZVernal - MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.0 craype/2.7.30 cray-mpich/8.1.28 cray-libsci/23.12.5 PrgEnv-cray StdEnv rocm/5.7.1 cmake/3.24.2" -# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.09.0 cce/16.0.1 craype/2.7.23 cray-mpich/8.1.27 cray-libsci/23.09.1.1 PrgEnv-cray/8.4.0 StdEnv rocm/5.7.1 cmake/3.24.2" + MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.0.3 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.0.3 cmake/3.24.2" +# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray StdEnv rocm/6.0.3 cmake/3.24.2" ;; *) echo "No pre-specified modules found for this system. 
Make sure to setup your own" @@ -219,8 +219,9 @@ set_center_specific_spack_dependencies() CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" ;; "zen3") # Tioga, RZVernal - CENTER_COMPILER="%rocmcc@5.7.1" - CENTER_DEPENDENCIES="^cray-mpich@8.1.28 ^hip@5.7.1 ^python@3.9.12" +# CENTER_COMPILER="%cce@17.0.1" + CENTER_COMPILER="%rocmcc@6.0.3" + CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.0.3 ^python@3.9.12" CENTER_BLAS_LIBRARY="blas=libsci" # Override the conduit variants for the cray compilers CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" @@ -390,7 +391,7 @@ EOF cat <> ${yaml} compilers: - compiler: - spec: rocmcc@5.7.1 + spec: rocmcc@6.0.3 paths: cc: amdclang cxx: amdclang++ @@ -402,15 +403,15 @@ cat <> ${yaml} operating_system: rhel8 target: any modules: - - PrgEnv-cray/8.4.0 - - cce/17.0.0 - - rocm/5.7.1 + - PrgEnv-amd + - amd/6.0.3 + - rocm/6.0.3 environment: {} - extra_rpaths: - - /opt/cray/pe/cce/17.0.0/cce/x86_64/lib - - /opt/cray/pe/cce/17.0.0/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu + # extra_rpaths: + # - /opt/cray/pe/cce/17.0.1/cce/x86_64/lib + # - /opt/cray/pe/cce/17.0.1/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu - compiler: - spec: cce@17.0.0 + spec: cce@17.0.1 paths: cc: craycc cxx: crayCC @@ -421,12 +422,12 @@ cat <> ${yaml} target: any modules: - PrgEnv-cray - - cce/17.0.0 - - rocm/5.7.1 + - cce/17.0.1 + - rocm/6.0.3 environment: {} extra_rpaths: - - /opt/cray/pe/cce/17.0.0/cce/x86_64/lib - - /opt/cray/pe/cce/17.0.0/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu + - /opt/cray/pe/cce/17.0.1/cce/x86_64/lib + - /opt/cray/pe/cce/17.0.1/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu packages: all: require: @@ -436,43 +437,48 @@ cat <> ${yaml} hipcub: buildable: false version: - - '5.7.1' + - '6.0.3' externals: - - spec: hipcub@5.7.1 arch=${spack_arch} - prefix: /opt/rocm-5.7.1/hipcub + - spec: hipcub@6.0.3 arch=${spack_arch} + prefix: /opt/rocm-6.0.3/hipcub rocthrust: 
buildable: false version: - - '5.7.1' + - '6.0.3' externals: - - spec: rocthrust@5.7.1 arch=${spack_arch} - prefix: /opt/rocm-5.7.1 + - spec: rocthrust@6.0.3 arch=${spack_arch} + prefix: /opt/rocm-6.0.3 llvm-amdgpu: buildable: false version: - - '5.7.1' + - '6.0.3' externals: - - spec: llvm-amdgpu@5.7.1 arch=${spack_arch} - prefix: /opt/rocm-5.7.1/llvm + - spec: llvm-amdgpu@6.0.3 arch=${spack_arch} + prefix: /opt/rocm-6.0.3/llvm cray-libsci: buildable: false version: - - '23.09.1.1' + - '24.03.0' externals: - - spec: cray-libsci@23.09.1.1 %rocmcc arch=${spack_arch} + - spec: cray-libsci@24.03.0 %rocmcc arch=${spack_arch} modules: - - cce/17.0.0 PrgEnv-cray cray-libsci/23.09.1.1 +# - amd/6.0.3 PrgEnv-amd cray-libsci/24.03.0 +# - amd/6.0.3 PrgEnv-amd cray-libsci/23.09.1.1 + - cce/17.0.1 PrgEnv-cray cray-libsci/24.03.0 cray-mpich: buildable: false version: - - '8.1.28' + - '8.1.29' externals: - - spec: cray-mpich@8.1.28 %rocmcc arch=${spack_arch} + - spec: cray-mpich@8.1.29 %rocmcc arch=${spack_arch} modules: - - cce/17.0.0 PrgEnv-cray cray-mpich/8.1.28 +# - amd/6.0.3 PrgEnv-amd cray-mpich/8.1.29 + - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF - set_superbuild_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" - set_superbuild_DHA_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" + set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${prefix}" + # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" + # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; *) echo "No center-specified externals." From 60cb632855709a587f84a4ea811f6525d206e9d7 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Tue, 9 Apr 2024 16:20:33 -0700 Subject: [PATCH 004/169] Added some env variables for RCCL --- python/lbann/contrib/lc/launcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index 80ead930727..fd0b2ea367d 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -96,6 +96,8 @@ def prepend_environment_path(key, prefix): # Optimizations for Tioga if system in ('tioga', 'rzvernal'): #set_environment('NCCL_SOCKET_IFNAME', 'hsi') + set_environment('NCCL_NET_GDR_LEVEL', '2') # From HPE to avoid hangs +# set_environment('FI_CXI_ATS', '0') # From Adam Moody Dist PyTorch on Tioga set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '0') set_environment('MIOPEN_DISABLE_CACHE', '0') tmpdir = os.environ.get('TMPDIR') From 045c7b063fcf8e07f64bd18332453dc953c13e22 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 17 Apr 2024 09:13:29 -0700 Subject: [PATCH 005/169] Add spack type for mi300a --- scripts/customize_build_env.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index f175a908b3f..e2d69a78484 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -70,7 +70,7 @@ set_center_specific_gpu_arch() # Use a HIP Clang variant GPU_ARCH_VARIANTS="amdgpu_target=gfx906" ;; - "zen3") # Tioga, RZVernal + "zen3" | "zen4") # Tioga, RZVernal, RZAdams # Use a HIP Clang variant # GPU_ARCH_VARIANTS="amdgpu_target=gfx90a" GPU_ARCH_VARIANTS="amdgpu_target=gfx90a,gfx942" @@ -125,7 +125,7 @@ set_center_specific_modules() MODULE_CMD="module load StdEnv gcc/10.3.1-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.0" # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; - "zen3") # Tioga, RZVernal + "zen3" | "zen4") # Tioga, RZVernal, RZAdams MODULE_CMD="module load craype-x86-trento craype-network-ofi 
libfabric/2.1 perftools-base/23.12.0 amd/6.0.3 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.0.3 cmake/3.24.2" # MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray StdEnv rocm/6.0.3 cmake/3.24.2" ;; @@ -218,7 +218,7 @@ set_center_specific_spack_dependencies() CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@5.7.0 ^python@3.9.12 ^py-protobuf@4.21.5" CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" ;; - "zen3") # Tioga, RZVernal + "zen3" | "zen4") # Tioga, RZVernal # CENTER_COMPILER="%cce@17.0.1" CENTER_COMPILER="%rocmcc@6.0.3" CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.0.3 ^python@3.9.12" @@ -383,8 +383,8 @@ EOF set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" ;; - "zen3") - if [ ${host} == "rzvernal" ]; then + "zen3" | "zen4") + if [[ ${host} == "rzvernal" || ${host} =~ "rzadams" ]]; then # Override the prefix path for this system prefix="/usr/workspace/lbann/stable_dependencies" fi @@ -722,7 +722,7 @@ set_center_specific_variants() "ivybridge") # Catalyst CENTER_USER_VARIANTS="+onednn" ;; - "zen" | "zen2") # Corona + "zen" | "zen2" | "zen3" | "zen4") # Corona CENTER_USER_VARIANTS="+rocm" ;; *) From d38da1dfd44446b2b3e696a45f01a9e9cbdd10e1 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 29 Apr 2024 11:38:15 -0700 Subject: [PATCH 006/169] Only include the external CUDA libraries on cuda systems. 
--- scripts/superbuild_externals.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index d592af5b3bd..8f520c7e3ab 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -120,6 +120,10 @@ cat <> ${yaml} externals: - spec: zstr@master arch=${spack_arch} prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/zstr +EOF + + if [[ ${dnn_lib} =~ "cuda" ]]; then +cat <> ${yaml} nccl: buildable: false version: @@ -147,6 +151,7 @@ cat <> ${yaml} prefix: ${prefix}/cutensor-1.7.0.1/libcutensor-linux-ppc64le-1.7.0.1-archive EOF + fi } set_superbuild_DHA_externals() From 235325193afc5fb2b752cf9d679afd5fe02b69e9 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 29 Apr 2024 11:38:48 -0700 Subject: [PATCH 007/169] Fixed the external modules for cray-mpich. --- scripts/customize_build_env.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index e2d69a78484..2ed4d52376e 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -472,8 +472,8 @@ cat <> ${yaml} externals: - spec: cray-mpich@8.1.29 %rocmcc arch=${spack_arch} modules: -# - amd/6.0.3 PrgEnv-amd cray-mpich/8.1.29 - - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 + - amd/6.0.3 PrgEnv-amd cray-mpich/8.1.29 +# - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${prefix}" From 0692c03dc2a20bd1a85e2b2df97c5c4fa55821b3 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 11:27:41 -0700 Subject: [PATCH 008/169] Ensure that the CMAKE_PREFIX_PATH is captured in the superbuild suggested prefix path. Fixed bug where the forwarded CMAKE_PREFIX_PATH was overwritten when a package depended on other packages. 
--- scripts/superbuild/CMakeLists.txt | 2 +- .../cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index 463d386c447..65757bf68a1 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -115,7 +115,7 @@ foreach (pkg ${LBANN_SB_BUILD_PKGS}) endforeach () # Print a helpful(?) message -set(LBANN_SB_SUGG_CMAKE_PREFIX_PATH_TMP "\$\{CMAKE_PREFIX_PATH\}") +set(LBANN_SB_SUGG_CMAKE_PREFIX_PATH_TMP "${CMAKE_PREFIX_PATH}" "\$\{CMAKE_PREFIX_PATH\}") message("\n-----------------------------------------------------------------\n") message("LBANN SuperBuild will build the following packages:\n") foreach (pkg ${LBANN_SB_BUILD_PKGS}) diff --git a/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake b/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake index 5794aacada0..2af0a86f87a 100644 --- a/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake +++ b/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake @@ -228,6 +228,7 @@ macro(lbann_sb_add_cmake_extern_pkg) set(LBANN_SB_DEPENDS_TAG "DEPENDS") string(REPLACE ";" "|" LBANN_SB_FWD_${PKG_NAME}_CMAKE_PREFIX_PATH + "${LBANN_SB_FWD_${PKG_NAME}_CMAKE_PREFIX_PATH};" "${LBANN_SB_${PKG_NAME}_DEPENDS_PATHS}") message(STATUS "${PKG_NAME} depends on: ${LBANN_SB_${PKG_NAME}_DEPENDS}") endif () From be22132ac23b17f01571f5e575f87626548c3c59 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 12:48:02 -0700 Subject: [PATCH 009/169] Automatically output the suggested cmake prefix path to the install directory. 
--- scripts/superbuild/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index 65757bf68a1..f2ecb515af0 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -142,6 +142,8 @@ message("Note that these assume a Bourne-compatible shell.") message("\n-----------------------------------------------------------------\n") file(WRITE "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" "export CMAKE_PREFIX_PATH=${LBANN_SB_SUGG_CMAKE_PREFIX_PATH}\n") +file(WRITE "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export CMAKE_PREFIX_PATH=${LBANN_SB_SUGG_CMAKE_PREFIX_PATH}\n") if (LBANN_SB_BUILD_AWS_OFI_RCCL) message("-----------------------------------------------------------------\n") From edd26590eb7e92e66989c1bffbefdb7de860505f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 13:57:26 -0700 Subject: [PATCH 010/169] Forwarded the CMAKE_PREFIX_PATH to the LBANN build. --- scripts/superbuild/ci/ci_pascal.sh | 152 +++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100755 scripts/superbuild/ci/ci_pascal.sh diff --git a/scripts/superbuild/ci/ci_pascal.sh b/scripts/superbuild/ci/ci_pascal.sh new file mode 100755 index 00000000000..fcf8473e684 --- /dev/null +++ b/scripts/superbuild/ci/ci_pascal.sh @@ -0,0 +1,152 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. 
+## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. +BUILD_WITH_DISTCONV=ON + +# Improve debugging info and remove some misguided warnings. These are +# passed only to the LBANN stack. 
+EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" + +# Prefer RPATH to RUNPATH (stability over flexibility) +EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" + +# Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) +CUDA_GPU_ARCH=60 + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Set to the preferred install directory +#INSTALL_PREFIX=${PWD}/install-rocm-distconv +INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 +INSTALL_PREFIX=${INSTALL_ROOT}/openmpi-4.1.2 + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild + +# Location of external packages +export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + \ + -D CMAKE_C_COMPILER=$(which gcc) \ + -D CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ + \ + -D LBANN_SB_BUILD_adiak=OFF \ + -D LBANN_SB_BUILD_Caliper=OFF \ + -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ + -D 
LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ + \ + -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenCV_TAG=4.x \ + \ + -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Aluminum_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ + -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Hydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=OFF \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ + -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + 
-D LBANN_SB_BUILD_LBANN=${BUILD_LBANN_STACK} \ + -D LBANN_SB_LBANN_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ + -D LBANN_SB_FWD_LBANN_CMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -D LBANN_SB_LBANN_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_LBANN_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_LBANN_LBANN_DATATYPE=float \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_CALIPER=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_TBINF=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_UNIT_TESTING=ON \ + -D LBANN_SB_FWD_LBANN_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} From 6cc61aa697a10544740506778e18e403a9b79abf Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 13:58:30 -0700 Subject: [PATCH 011/169] Added a flag to the build_lbann.sh script to specify a directory of superbuilt external libraries. --- scripts/build_lbann.sh | 14 +++++++++++++- scripts/customize_build_env.sh | 16 +++++++++++----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index df9d091a356..2cc8cdaf511 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -54,6 +54,8 @@ ALUMINUM_VER="@master" DIHYDROGEN_VER="@develop" # Default variants for Conduit to minimize dependencies CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" +# User specified location for externals built with the superbuild +LBANN_SUPERBUILD_EXTERNAL_DIR="" ################################################################ # Help message @@ -98,6 +100,7 @@ Options: ${C}--tmp-build-dir${N} Put the build directory in tmp space ${C}--spec-only${N} Stop after a spack spec command ${C}-s | --stable${N} Use the latest stable defaults not the head of Hydrogen, DiHydrogen and Aluminum repos + ${C}--superbuild-prefix${N} Use the externals built with the superbuild that are installed under the given directory (requires a path argument) ${C}--hydrogen-repo ${N} Use a local repository for the Hydrogen library
${C}--dihydrogen-repo ${N} Use a local repository for the DiHydrogen library ${C}--aluminum-repo ${N} Use a local repository for the Aluminum library @@ -233,6 +236,15 @@ while :; do ALUMINUM_VER="@1.0.0-lbann" DIHYDROGEN_VER= ;; + --superbuild-prefix) + if [ -n "${2}" ]; then + LBANN_SUPERBUILD_EXTERNAL_DIR=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + exit 1 + fi + ;; --hydrogen-repo) if [ -n "${2}" ]; then HYDROGEN_PATH=${2} @@ -777,7 +789,7 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then # See if there are any center-specific externals SPACK_ENV_YAML_FILE="${SPACK_ROOT}/var/spack/environments/${LBANN_ENV}/spack.yaml" - CMD="set_center_specific_externals ${CENTER} ${SPACK_ARCH_TARGET} ${SPACK_ARCH} ${SPACK_ENV_YAML_FILE} ${LBANN_MODFILES_DIR}" + CMD="set_center_specific_externals ${CENTER} ${SPACK_ARCH_TARGET} ${SPACK_ARCH} ${SPACK_ENV_YAML_FILE} ${LBANN_MODFILES_DIR} ${LBANN_SUPERBUILD_EXTERNAL_DIR}" echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 2ed4d52376e..59ec08b359b 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -114,7 +114,8 @@ set_center_specific_modules() ;; "broadwell" | "haswell" | "sandybridge") # Pascal, RZHasGPU, Surface - MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/10.3.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" + MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/11.2.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" +# MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/10.3.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" # Note that clang is installed in /usr/workspace/brain/tom/pascal/llvm/latest/ and it is version 17.0.0 MODULE_CMD_CLANG="module load gcc/10.3.1 cuda/11.8.0 mvapich2/2.3.7 python/3.9.12" ;; @@ -199,7 +200,7 @@ set_center_specific_spack_dependencies() ;; "broadwell" | 
"haswell" | "sandybridge") # Pascal, RZHasGPU, Surface # On LC the mvapich2 being used is built against HWLOC v1 - CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-10.3.1-magic /usr/workspace/brain/tom/pascal/llvm/latest/" + CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-11.2.1-magic /usr/workspace/brain/tom/pascal/llvm/latest/" CENTER_COMPILER="%gcc" # CENTER_COMPILER="%clang" # DEPENDENTS_CENTER_COMPILER="%gcc@10.3.1" @@ -301,9 +302,12 @@ set_center_specific_externals() local spack_arch="$3" local yaml="$4" local module_dir="$5" + local prefix="$6" if [[ ${center} = "llnl_lc" ]]; then - prefix="/p/vast1/lbann/stable_dependencies" + if [[ -z ${prefix} ]]; then + prefix="/p/vast1/lbann/stable_dependencies" + fi host=$(host_basename) case ${spack_arch_target} in "broadwell" | "haswell" | "sandybridge" | "ivybridge") @@ -385,8 +389,10 @@ EOF ;; "zen3" | "zen4") if [[ ${host} == "rzvernal" || ${host} =~ "rzadams" ]]; then - # Override the prefix path for this system - prefix="/usr/workspace/lbann/stable_dependencies" + if [[ -z ${prefix} ]]; then + # Override the prefix path for this system + prefix="/usr/workspace/lbann/stable_dependencies" + fi fi cat <> ${yaml} compilers: From 3fc2169e56203568f7b3f303e4dd0446054c11d9 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 13:59:57 -0700 Subject: [PATCH 012/169] Added the superbuild-prefix to the Pascal CI pipeline. 
--- .gitlab/pascal/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index 1a8d0e85d8d..f88e90b715e 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -70,6 +70,7 @@ build and install: - !reference [.setup_spack, script] - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} - export TEST_TASKS_PER_NODE=2 From 5e345f1a4a9c67bd37f04883e87a459bcca77afe Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 14:18:03 -0700 Subject: [PATCH 013/169] Disable caliper and force gcc@11.2.1 --- .gitlab/pascal/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index f88e90b715e..c9cbd0e393d 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -71,8 +71,8 @@ build and install: - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies - --ci-pip -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + --ci-pip -- %gcc@11.2.1 + +deterministic +vision +numpy +unit_tests ~caliper ${SPACK_SPECS} - export TEST_TASKS_PER_NODE=2 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(spack arch) From a07f99d63293ad252f5b31e1dd2273b2f3387c50 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 18:31:54 -0700 Subject: [PATCH 014/169] Switch back to using the system specific spack. 
--- .gitlab/pascal/pipeline.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index c9cbd0e393d..cf6fffadb08 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -178,10 +178,10 @@ release allocation: variables: # Just the obvious identifier. Which specific node doesn't matter. SYSTEM_NAME: pascal - # SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} - # SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git - SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test - SPACK_REPO: spack_repos/spack_test.git + SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} + SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git + #SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test + #SPACK_REPO: spack_repos/spack_test.git # These are system-specific specs that should be forwarded to the # build script From 8906a6741670ed316a639c58c5153b33b0dbc61d Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 18:44:11 -0700 Subject: [PATCH 015/169] Force the use of normal zlib --- .gitlab/pascal/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index cf6fffadb08..7ecdd597a98 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -72,7 +72,7 @@ build and install: -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- %gcc@11.2.1 - +deterministic +vision +numpy +unit_tests ~caliper ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ~caliper ^zlib ${SPACK_SPECS} - export TEST_TASKS_PER_NODE=2 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(spack arch) From 24615f47e4a42f90ad516ea872b31b5ab4c6f527 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Wed, 12 Jun 2024 19:00:31 -0700 Subject: [PATCH 016/169] Force the use of normal zlib --- .gitlab/pascal/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index 7ecdd597a98..744503b9493 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -72,7 +72,7 @@ build and install: -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- %gcc@11.2.1 - +deterministic +vision +numpy +unit_tests ~caliper ^zlib ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ~caliper ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=2 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(spack arch) From ce70053871bdc58cba111508edc1b594e586341e Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 12 Jun 2024 23:12:06 -0700 Subject: [PATCH 017/169] Temporarily disable half on Pascal. --- .gitlab/pascal/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index 744503b9493..214882e40e3 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -185,7 +185,8 @@ release allocation: # These are system-specific specs that should be forwarded to the # build script - SPACK_SPECS: "+cuda +half +fft" + SPACK_SPECS: "+cuda +fft" +# SPACK_SPECS: "+cuda +half +fft" tags: - pascal From 9f71345b483782c324d3805034c7be19f625e315 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 13 Jun 2024 09:59:04 -0700 Subject: [PATCH 018/169] Split the superbuild scripts into core dependencies and DHA dependencies. 
--- .../ci/ci_pascal_core_dependencies.sh | 117 ++++++++++++++++++ ...ci_pascal.sh => ci_pascal_dha_distconv.sh} | 50 ++------ 2 files changed, 128 insertions(+), 39 deletions(-) create mode 100755 scripts/superbuild/ci/ci_pascal_core_dependencies.sh rename scripts/superbuild/ci/{ci_pascal.sh => ci_pascal_dha_distconv.sh} (72%) diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh new file mode 100755 index 00000000000..06686629cb1 --- /dev/null +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -0,0 +1,117 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "License"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license.
+################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# # Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +# BUILD_LBANN_STACK=ON + +# # Set to ON to enable DistConv support. Only matters if building the +# # LBANN stack. +# BUILD_WITH_DISTCONV=ON + +# Improve debugging info and remove some misguided warnings. These are +# # passed only to the LBANN stack. +# EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +# EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" + +# # Prefer RPATH to RUNPATH (stability over flexibility) +# EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" + +# # Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) +# CUDA_GPU_ARCH=60 + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh + +# Set to the preferred install directory +#INSTALL_PREFIX=${PWD}/install-rocm-distconv +# INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies + +# Location of external packages +# export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 +# FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + \ + -D CMAKE_C_COMPILER=$(which gcc) \ + -D 
CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ + \ + -D LBANN_SB_BUILD_adiak=OFF \ + -D LBANN_SB_BUILD_Caliper=OFF \ + -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ + \ + -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenCV_TAG=4.x diff --git a/scripts/superbuild/ci/ci_pascal.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh similarity index 72% rename from scripts/superbuild/ci/ci_pascal.sh rename to scripts/superbuild/ci/ci_pascal_dha_distconv.sh index fcf8473e684..60ce54732ee 100755 --- a/scripts/superbuild/ci/ci_pascal.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -52,16 +52,21 @@ LBANN_SRC_DIR=$(git rev-parse --show-toplevel) # Set to the directory with the top-level SuperBuild CMakeLists.txt file SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh + # Set to the preferred install directory 
#INSTALL_PREFIX=${PWD}/install-rocm-distconv -INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 -INSTALL_PREFIX=${INSTALL_ROOT}/openmpi-4.1.2 +#INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv + -# Location of external packages -export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 +# Update the location of external packages +source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +# export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} #export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} @@ -93,26 +98,6 @@ cmake \ -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ \ - -D LBANN_SB_BUILD_adiak=OFF \ - -D LBANN_SB_BUILD_Caliper=OFF \ - -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ - -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ - \ - -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_OpenCV_TAG=4.x \ - \ -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Aluminum_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ @@ -136,17 +121,4 @@ cmake \ -D 
LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_LBANN=${BUILD_LBANN_STACK} \ - -D LBANN_SB_LBANN_BUILD_SHARED_LIBS=ON \ - -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ - -D LBANN_SB_FWD_LBANN_CMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -D LBANN_SB_LBANN_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_LBANN_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_LBANN_LBANN_DATATYPE=float \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_CALIPER=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_TBINF=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_UNIT_TESTING=ON \ - -D LBANN_SB_FWD_LBANN_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} From 472d426c27c63254c3e36b0d1b534e4e201c5a60 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 13 Jun 2024 10:22:16 -0700 Subject: [PATCH 019/169] Added superbuild script for DHA with half. --- .../ci/ci_pascal_core_dependencies.sh | 24 ---- .../superbuild/ci/ci_pascal_dha_distconv.sh | 25 +--- scripts/superbuild/ci/ci_pascal_dha_half.sh | 108 ++++++++++++++++++ 3 files changed, 112 insertions(+), 45 deletions(-) create mode 100755 scripts/superbuild/ci/ci_pascal_dha_half.sh diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index 06686629cb1..985e282133e 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -28,24 +28,6 @@ # dependencies of the LBANN stack BUILD_EXTERNAL_TPLS=ON -# # Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -# BUILD_LBANN_STACK=ON - -# # Set to ON to enable DistConv support. 
Only matters if building the -# # LBANN stack. -# BUILD_WITH_DISTCONV=ON - -# Improve debugging info and remove some misguided warnings. These are -# # passed only to the LBANN stack. -# EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -# EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" - -# # Prefer RPATH to RUNPATH (stability over flexibility) -# EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" - -# # Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) -# CUDA_GPU_ARCH=60 - # Set to the directory with the top-level CMakeLists.txt file for LBANN LBANN_SRC_DIR=$(git rev-parse --show-toplevel) @@ -56,17 +38,11 @@ SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh # Set to the preferred install directory -#INSTALL_PREFIX=${PWD}/install-rocm-distconv -# INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} # Set to the preferred build directory BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies -# Location of external packages -# export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 -# FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} - #export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} cmake \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index 60ce54732ee..a308c6e612b 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -24,10 +24,6 @@ ## permissions and limitations under the license. ################################################################################ -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - # Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN BUILD_LBANN_STACK=ON @@ -35,16 +31,9 @@ BUILD_LBANN_STACK=ON # LBANN stack. 
BUILD_WITH_DISTCONV=ON -# Improve debugging info and remove some misguided warnings. These are -# passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" - -# Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" - -# Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) -CUDA_GPU_ARCH=60 +# Set to ON to enable Half support. Only matters if building the +# LBANN stack. +BUILD_WITH_HALF=OFF # Set to the directory with the top-level CMakeLists.txt file for LBANN LBANN_SRC_DIR=$(git rev-parse --show-toplevel) @@ -56,21 +45,15 @@ SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh # Set to the preferred install directory -#INSTALL_PREFIX=${PWD}/install-rocm-distconv -#INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv - # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -# export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - cmake \ -G Ninja \ -S ${SUPERBUILD_SRC_DIR} \ @@ -112,7 +95,7 @@ cmake \ -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=OFF \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ diff --git 
a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh new file mode 100755 index 00000000000..3148232a0c6 --- /dev/null +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -0,0 +1,108 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "License"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. +BUILD_WITH_DISTCONV=OFF + +# Set to ON to enable Half support. Only matters if building the +# LBANN stack.
+BUILD_WITH_HALF=ON + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_half + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-half + +# Update the location of external packages +source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + \ + -D CMAKE_C_COMPILER=$(which gcc) \ + -D CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ + \ + -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Aluminum_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ + -D 
LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ + -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Hydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ + -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} From 5f7fc4fae4f8a57d6b48442f28c7cfd7f8d4e101 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 13 Jun 2024 13:40:54 -0700 Subject: [PATCH 020/169] Updated the build scripts to allow for specific DHA compiled versions. 
--- scripts/build_lbann.sh | 18 ++++- scripts/customize_build_env.sh | 9 ++- scripts/superbuild/ci/ci_pascal_dha.sh | 108 +++++++++++++++++++++++++ scripts/superbuild_externals.sh | 9 ++- 4 files changed, 135 insertions(+), 9 deletions(-) create mode 100755 scripts/superbuild/ci/ci_pascal_dha.sh diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 2cc8cdaf511..d439dd3c1d6 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -507,6 +507,22 @@ if [[ "${LBANN_VARIANTS}" =~ (.*)(%[0-9a-zA-Z:\.@]+)(.*) ]]; then LBANN_VARIANTS="${BASH_REMATCH[1]} ${BASH_REMATCH[3]}" fi +# Here is a fairly brittle way to find the DiHydrogen, Hydrogen, and Aluminum superbuilds +LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="dha" +if [[ "${LBANN_VARIANTS}" =~ .*"+distconv".* ]]; then + # When the +distconv variant is requested, append _with_distconv to the DHA directory name + LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}_with_distconv" +fi +if [[ "${LBANN_VARIANTS}" =~ .*"+half".* ]]; then + # When the +half variant is requested, append _with_half to the DHA directory name + LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}_with_half" +fi +if [[ "${LBANN_VARIANTS}" =~ .*"+nvshmem".* ]]; then + # When the +nvshmem variant is requested, append _with_nvshmem to the DHA directory name + LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}_with_nvshmem" +fi + + if [[ "${CENTER_COMPILER}" =~ .*"%clang".* ]]; then # If the compiler is clang use the LLD fast linker CENTER_LINKER_FLAGS="+lld" @@ -789,7 +805,7 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then # See if there are any center-specific externals SPACK_ENV_YAML_FILE="${SPACK_ROOT}/var/spack/environments/${LBANN_ENV}/spack.yaml" - CMD="set_center_specific_externals ${CENTER} ${SPACK_ARCH_TARGET} ${SPACK_ARCH} ${SPACK_ENV_YAML_FILE} ${LBANN_MODFILES_DIR} ${LBANN_SUPERBUILD_EXTERNAL_DIR}" + CMD="set_center_specific_externals ${CENTER} ${SPACK_ARCH_TARGET}
${SPACK_ARCH} ${SPACK_ENV_YAML_FILE} ${LBANN_MODFILES_DIR} ${LBANN_SUPERBUILD_EXTERNAL_DIR} ${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}" echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 59ec08b359b..37f3edd5179 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -303,6 +303,7 @@ set_center_specific_externals() local yaml="$4" local module_dir="$5" local prefix="$6" + local dha_dir="$7" if [[ ${center} = "llnl_lc" ]]; then if [[ -z ${prefix} ]]; then @@ -330,7 +331,7 @@ cat <> ${yaml} - mvapich2/2.3.7 EOF set_superbuild_externals ${host} "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "power9le" | "power8le") cat <> ${yaml} @@ -344,7 +345,7 @@ cat <> ${yaml} prefix: /usr EOF set_superbuild_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" "${dha_dir}" set_superbuild_power_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" ;; @@ -384,7 +385,7 @@ cat <> ${yaml} EOF set_superbuild_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "zen3" | "zen4") @@ -482,7 +483,7 @@ cat <> ${yaml} # - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" - 
set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh new file mode 100755 index 00000000000..ed0b2beddca --- /dev/null +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -0,0 +1,108 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. 
+BUILD_WITH_DISTCONV=OFF + +# Set to ON to enable Half support. Only matters if building the +# LBANN stack. +BUILD_WITH_HALF=OFF + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha + +# Update the location of external packages +source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + \ + -D CMAKE_C_COMPILER=$(which gcc) \ + -D CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ + \ + -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Aluminum_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ + -D 
LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ + -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Hydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ + -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 8f520c7e3ab..e5a3d809fc9 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -161,7 +161,8 @@ set_superbuild_DHA_externals() local mpi="$3" local yaml="$4" local prefix="$5" - local gpu_arch="$6" + local dha_dir="$6" + local gpu_arch="$7" if [ -n "${gpu_arch}" ]; then dnn_lib="${dnn_lib}/${gpu_arch}" @@ -176,21 +177,21 @@ cat <> ${yaml} - 'master' externals: - spec: aluminum@master arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/aluminum + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/aluminum hydrogen: buildable: false version: - 'develop' externals: - spec: hydrogen@develop arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hydrogen + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/hydrogen dihydrogen: buildable: false version: - 'develop' 
externals: - spec: dihydrogen@develop arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/dihydrogen + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/dihydrogen EOF } From 91b9a7194d584360dd96d7f48bb9812b01af1a2b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 13 Jun 2024 15:31:03 -0700 Subject: [PATCH 021/169] Reenabled half on pascal CI test. --- .gitlab/pascal/pipeline.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index 214882e40e3..744503b9493 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -185,8 +185,7 @@ release allocation: # These are system-specific specs that should be forwarded to the # build script - SPACK_SPECS: "+cuda +fft" -# SPACK_SPECS: "+cuda +half +fft" + SPACK_SPECS: "+cuda +half +fft" tags: - pascal From e812fb3f742dbbb5dee1eb73317ad96b224df7ca Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 14 Jun 2024 11:02:40 -0700 Subject: [PATCH 022/169] Allow for newer gcc compilers. 
--- .gitlab-ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8d555aa6b68..965bf6e969c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,7 +93,8 @@ pascal testing: pascal compiler testing: stage: run-all-clusters variables: - SPACK_SPECS: "%gcc@10.3.1 +cuda +half +fft" + SPACK_SPECS: "+cuda +half +fft" +# SPACK_SPECS: "%gcc@10.3.1 +cuda +half +fft" BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" trigger: @@ -104,7 +105,8 @@ pascal distconv testing: stage: run-all-clusters variables: JOB_NAME_SUFFIX: _distconv - SPACK_SPECS: "%gcc@10.3.1 +cuda +distconv +fft" + SPACK_SPECS: "+cuda +distconv +fft" +# SPACK_SPECS: "%gcc@10.3.1 +cuda +distconv +fft" BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" From ab059707a28f8a33bad041ff3a26facd6d895629 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 14 Jun 2024 13:35:57 -0700 Subject: [PATCH 023/169] Updated all of the pascal CI scripts to use the new stable dependencies. 
--- .gitlab/pascal/pipeline.yml | 4 ++-- .gitlab/pascal/pipeline_compiler_tests.yml | 11 ++++++----- scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index 744503b9493..fb0dca3eb6c 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -71,8 +71,8 @@ build and install: - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies - --ci-pip -- %gcc@11.2.1 - +deterministic +vision +numpy +unit_tests ~caliper ${SPACK_SPECS} ^zlib + --ci-pip -- + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=2 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(spack arch) diff --git a/.gitlab/pascal/pipeline_compiler_tests.yml b/.gitlab/pascal/pipeline_compiler_tests.yml index e8633d2e3f5..99a48200f58 100644 --- a/.gitlab/pascal/pipeline_compiler_tests.yml +++ b/.gitlab/pascal/pipeline_compiler_tests.yml @@ -67,8 +67,9 @@ build and install: - !reference [.setup_spack, script] - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip ${BUILD_SCRIPT_OPTIONS} -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=2 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(spack arch) @@ -123,10 +124,10 @@ release allocation: variables: # Just the obvious identifier. Which specific node doesn't matter. 
SYSTEM_NAME: pascal - # SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} - # SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git - SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test - SPACK_REPO: spack_repos/spack_test.git + SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} + SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git + #SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test + #SPACK_REPO: spack_repos/spack_test.git # Specific extensions to the SPACK_ENV_BASE_NAME SPACK_ENV_BASE_NAME_EXTENSION: -gcc diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index 985e282133e..c1207b05e6c 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -72,8 +72,8 @@ cmake \ -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ \ - -D LBANN_SB_BUILD_adiak=OFF \ - -D LBANN_SB_BUILD_Caliper=OFF \ + -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ \ From f60991043437acca63606048606d5755614d18ba Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 14 Jun 2024 16:07:55 -0700 Subject: [PATCH 024/169] Updating the Tioga scripts to use the superbuild. 
--- .gitlab/tioga/pipeline.yml | 3 +- .../ci/ci_tioga_core_dependencies.sh | 107 ++++++++++++++ scripts/superbuild/ci/ci_tioga_dha.sh | 133 ++++++++++++++++++ scripts/superbuild/ci/ci_tioga_env.sh | 49 +++++++ 4 files changed, 291 insertions(+), 1 deletion(-) create mode 100755 scripts/superbuild/ci/ci_tioga_core_dependencies.sh create mode 100755 scripts/superbuild/ci/ci_tioga_dha.sh create mode 100644 scripts/superbuild/ci/ci_tioga_env.sh diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 7b17fc51949..49c271a86fb 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -77,8 +77,9 @@ build and install: - !reference [.setup_spack, script] - flux proxy ${JOB_ID} flux run -N 1 -t 30m ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=4 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux run -N 1 spack arch) diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh new file mode 100755 index 00000000000..54c0dcf60b6 --- /dev/null +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -0,0 +1,107 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. 
+## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. +BUILD_WITH_DISTCONV=ON + +# Set to ON if you're on a Cray machine that doesn't provide the AWS +# plugin as part of its default RCCL installation. +# +# It might also be advisable to build this if you build a custom RCCL. +# The configuration script takes a RCCL path as a parameter, so it +# could matter, but it's not clear how much. 
+BUILD_AWS_OFI_RCCL_PLUGIN=ON + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + \ + -D CMAKE_C_COMPILER=$(which amdclang) \ + -D CMAKE_CXX_COMPILER=$(which amdclang++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ + \ + -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ + \ + -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hwloc=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ + \ + -D 
LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenCV_TAG=4.x \ + \ + -D LBANN_SB_BUILD_AWS_OFI_RCCL=${BUILD_AWS_OFI_RCCL_PLUGIN} diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh new file mode 100755 index 00000000000..934a088a62b --- /dev/null +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -0,0 +1,133 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. +BUILD_WITH_DISTCONV=OFF + +# Set to ON to enable Half support.
Only matters if building the +# LBANN stack. +BUILD_WITH_HALF=OFF + +# Set to ON if you're on a Cray machine that doesn't provide the AWS +# plugin as part of its default RCCL installation. +# +# It might also be advisable to build this if you build a custom RCCL. +# The configuration script takes a RCCL path as a parameter, so it +# could matter, but it's not clear how much. +BUILD_AWS_OFI_RCCL_PLUGIN=ON + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha + +# Update the location of external packages +source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + \ + -D CMAKE_C_COMPILER=$(which amdclang) \ + -D CMAKE_CXX_COMPILER=$(which amdclang++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ + \ + -D 
LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ + -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ + -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_LBANN=${BUILD_LBANN_STACK} \ + -D LBANN_SB_LBANN_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ + -D LBANN_SB_FWD_LBANN_CMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -D LBANN_SB_LBANN_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_LBANN_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_LBANN_LBANN_DATATYPE=float \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_CALIPER=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_TBINF=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_UNIT_TESTING=ON \ + -D LBANN_SB_FWD_LBANN_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} diff --git 
a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh new file mode 100644 index 00000000000..4369a137f4d --- /dev/null +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -0,0 +1,49 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Improve debugging info and remove some misguided warnings. These are +# passed only to the LBANN stack. 
+EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" + +# Prefer RPATH to RUNPATH (stability over flexibility) +EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" + +# Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) +AMD_GPU_ARCH=gfx90a,gfx942 + + +# Set to the preferred install directory +#INSTALL_PREFIX=${PWD}/install-rocm-distconv +INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/rocm-6.1.2 +INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-8.1.29/ + +# Location of external packages +export CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + From 6bdf0df6ae4cd54e0b52bfc3551b37f5fb65224e Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Sat, 15 Jun 2024 14:30:31 -0700 Subject: [PATCH 025/169] Fixed the sense of the shared variant on protobuf. --- scripts/superbuild_externals.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index e5a3d809fc9..754e47853cb 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -111,7 +111,8 @@ cat <> ${yaml} version: - '3.21.5' externals: - - spec: protobuf@3.21.5~shared arch=${spack_arch} + - spec: protobuf@3.21.5+shared arch=${spack_arch} +# - spec: protobuf@3.21.5~shared arch=${spack_arch} prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/protobuf zstr: buildable: false From 899b8249aef398a511d546ff5e6127771895886f Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Sat, 15 Jun 2024 14:39:09 -0700 Subject: [PATCH 026/169] Updated the AMD ROCm stack to 6.1.2 --- scripts/customize_build_env.sh | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 37f3edd5179..a7f1e2a6f11 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -127,7 +127,7 @@ set_center_specific_modules() # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3" | "zen4") # Tioga, RZVernal, RZAdams - MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.0.3 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.0.3 cmake/3.24.2" + MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.1.2 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.1.2 cmake/3.24.2" # MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray StdEnv rocm/6.0.3 cmake/3.24.2" ;; *) @@ -221,8 +221,8 @@ set_center_specific_spack_dependencies() ;; "zen3" | "zen4") # Tioga, RZVernal # CENTER_COMPILER="%cce@17.0.1" - CENTER_COMPILER="%rocmcc@6.0.3" - CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.0.3 ^python@3.9.12" + CENTER_COMPILER="%rocmcc@6.1.2" + CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.1.2 ^python@3.9.12" CENTER_BLAS_LIBRARY="blas=libsci" # Override the conduit variants for the cray compilers CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" @@ -398,7 +398,7 @@ EOF cat <> ${yaml} compilers: - compiler: - spec: rocmcc@6.0.3 + spec: rocmcc@6.1.2 paths: cc: amdclang cxx: amdclang++ @@ -411,8 +411,8 @@ cat <> ${yaml} target: any modules: - PrgEnv-amd - - amd/6.0.3 - - rocm/6.0.3 + - amd/6.1.2 + - rocm/6.1.2 environment: {} # extra_rpaths: # - 
/opt/cray/pe/cce/17.0.1/cce/x86_64/lib @@ -430,7 +430,7 @@ cat <> ${yaml} modules: - PrgEnv-cray - cce/17.0.1 - - rocm/6.0.3 + - rocm/6.1.2 environment: {} extra_rpaths: - /opt/cray/pe/cce/17.0.1/cce/x86_64/lib @@ -444,24 +444,24 @@ cat <> ${yaml} hipcub: buildable: false version: - - '6.0.3' + - '6.1.2' externals: - - spec: hipcub@6.0.3 arch=${spack_arch} - prefix: /opt/rocm-6.0.3/hipcub + - spec: hipcub@6.1.2 arch=${spack_arch} + prefix: /opt/rocm-6.1.2/hipcub rocthrust: buildable: false version: - - '6.0.3' + - '6.1.2' externals: - - spec: rocthrust@6.0.3 arch=${spack_arch} - prefix: /opt/rocm-6.0.3 + - spec: rocthrust@6.1.2 arch=${spack_arch} + prefix: /opt/rocm-6.1.2 llvm-amdgpu: buildable: false version: - - '6.0.3' + - '6.1.2' externals: - - spec: llvm-amdgpu@6.0.3 arch=${spack_arch} - prefix: /opt/rocm-6.0.3/llvm + - spec: llvm-amdgpu@6.1.2 arch=${spack_arch} + prefix: /opt/rocm-6.1.2/llvm cray-libsci: buildable: false version: @@ -469,8 +469,8 @@ cat <> ${yaml} externals: - spec: cray-libsci@24.03.0 %rocmcc arch=${spack_arch} modules: -# - amd/6.0.3 PrgEnv-amd cray-libsci/24.03.0 -# - amd/6.0.3 PrgEnv-amd cray-libsci/23.09.1.1 +# - amd/6.1.2 PrgEnv-amd cray-libsci/24.03.0 +# - amd/6.1.2 PrgEnv-amd cray-libsci/23.09.1.1 - cce/17.0.1 PrgEnv-cray cray-libsci/24.03.0 cray-mpich: buildable: false @@ -479,11 +479,11 @@ cat <> ${yaml} externals: - spec: cray-mpich@8.1.29 %rocmcc arch=${spack_arch} modules: - - amd/6.0.3 PrgEnv-amd cray-mpich/8.1.29 + - amd/6.1.2 PrgEnv-amd cray-mpich/8.1.29 # - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF - set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "rocm-6.1.2" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-6.1.2" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" # set_superbuild_externals 
${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; From 0f825f6ac19ba5379a48fa215697f024bbafedfc Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Sat, 15 Jun 2024 17:21:32 -0700 Subject: [PATCH 027/169] Adding path for external HWLOC in superbuild stable dependencies. Added code to export the CRAY_LD_LIBRARY_PATH. --- scripts/build_lbann.sh | 1 + scripts/customize_build_env.sh | 2 +- scripts/superbuild_externals.sh | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index d439dd3c1d6..2d7f83a2a43 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1092,6 +1092,7 @@ export LBANN_PYTHON_DIR=\$(dirname ${LBANN_PYTHON}) # Postpend the paths to the build tools to avoid putting system paths up front export PATH=\${PATH}:\${LBANN_CMAKE_DIR}:\${LBANN_NINJA_DIR}:\${LBANN_PYTHON_DIR} export PYTHONPATH=\${LBANN_PYTHONPATH}:\${PYTHONPATH} +export LD_LIBRARY_PATH=\${CRAY_LD_LIBRARY_PATH}:\${LD_LIBRARY_PATH} EOF if [[ -n "${MODULE_CMD}" ]]; then diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index a7f1e2a6f11..479e367dba3 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -222,7 +222,7 @@ set_center_specific_spack_dependencies() "zen3" | "zen4") # Tioga, RZVernal # CENTER_COMPILER="%cce@17.0.1" CENTER_COMPILER="%rocmcc@6.1.2" - CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.1.2 ^python@3.9.12" + CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.1.2 ^python@3.9.12 ^hwloc@3.0.0" CENTER_BLAS_LIBRARY="blas=libsci" # Override the conduit variants for the cray compilers CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 754e47853cb..9d0e79bc443 100644 --- a/scripts/superbuild_externals.sh +++
b/scripts/superbuild_externals.sh @@ -123,6 +123,18 @@ cat <> ${yaml} prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/zstr EOF + if [[ ${dnn_lib} =~ "rocm" ]]; then +cat <> ${yaml} + hwloc: + buildable: false + version: + - '3.0.0' + externals: + - spec: hwloc@3.0.0 arch=${spack_arch} + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hwloc +EOF + fi + if [[ ${dnn_lib} =~ "cuda" ]]; then cat <> ${yaml} nccl: @@ -170,6 +182,9 @@ set_superbuild_DHA_externals() fi # source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh + CMD="source ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}logs/lbann_sb_suggested_cmake_prefix_path.sh" + echo ${CMD} | tee -a ${LOG} + ${CMD} cat <> ${yaml} aluminum: From c31736efb0c9f4e0be6765064bf2c4442f14c8a4 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Sat, 15 Jun 2024 17:28:30 -0700 Subject: [PATCH 028/169] Add aws-ofi-rccl to the superbuild externals. --- scripts/superbuild_externals.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 9d0e79bc443..5d781bf8bc0 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -132,6 +132,13 @@ cat <> ${yaml} externals: - spec: hwloc@3.0.0 arch=${spack_arch} prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hwloc + aws-ofi-rccl: + buildable: false + version: + - 'cxi' + externals: + - spec: aws-ofi-rccl@cxi arch=${spack_arch} + prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/aws_ofi_rccl EOF fi From a97ed1cc533851bc43cd90e0adef6d32162a3f51 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Mon, 17 Jun 2024 02:19:11 -0700 Subject: [PATCH 029/169] Fixed typo --- scripts/superbuild_externals.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 5d781bf8bc0..3b140240c5b 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -189,7 +189,7 @@ set_superbuild_DHA_externals() fi # source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh - CMD="source ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}logs/lbann_sb_suggested_cmake_prefix_path.sh" + CMD="source ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/logs/lbann_sb_suggested_cmake_prefix_path.sh" echo ${CMD} | tee -a ${LOG} ${CMD} From ea770350d06eff0519465aa28f1b5c3de12582a6 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 08:20:30 -0700 Subject: [PATCH 030/169] Fix how the CMAKE_PREFIX_PATH is forwarded to DHA libraries. --- .../ci/ci_tioga_core_dependencies.sh | 2 +- scripts/superbuild/ci/ci_tioga_dha.sh | 20 ++++--------------- scripts/superbuild/ci/ci_tioga_env.sh | 2 +- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 54c0dcf60b6..32ee028519e 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -63,7 +63,7 @@ cmake \ -S ${SUPERBUILD_SRC_DIR} \ -B ${BUILD_DIR} \ \ - -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index 934a088a62b..c8c3b4c8490 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -65,7 +65,8 @@ BUILD_DIR=${TMPDIR}/lbann-superbuild-dha # 
Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} @@ -74,7 +75,7 @@ cmake \ -S ${SUPERBUILD_SRC_DIR} \ -B ${BUILD_DIR} \ \ - -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ \ @@ -117,17 +118,4 @@ cmake \ -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_LBANN=${BUILD_LBANN_STACK} \ - -D LBANN_SB_LBANN_BUILD_SHARED_LIBS=ON \ - -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ - -D LBANN_SB_FWD_LBANN_CMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -D LBANN_SB_LBANN_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_LBANN_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_LBANN_LBANN_DATATYPE=float \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_CALIPER=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_TBINF=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_UNIT_TESTING=ON \ - -D LBANN_SB_FWD_LBANN_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 4369a137f4d..b3739abd998 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -43,7 +43,7 @@ INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-8.1.29/ # Location of external packages 
export CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} From 9edde7ac78141851c8593a45412f000be2f05b86 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 08:24:23 -0700 Subject: [PATCH 031/169] Fix how the CMAKE_PREFIX_PATH is forwarded to DHA libraries. --- scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 2 +- scripts/superbuild/ci/ci_pascal_dha.sh | 5 +++-- scripts/superbuild/ci/ci_pascal_dha_distconv.sh | 5 +++-- scripts/superbuild/ci/ci_pascal_dha_half.sh | 5 +++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index c1207b05e6c..8fc46488c2c 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -50,7 +50,7 @@ cmake \ -S ${SUPERBUILD_SRC_DIR} \ -B ${BUILD_DIR} \ \ - -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ \ diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh index ed0b2beddca..7862455bcd3 100755 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -53,14 +53,15 @@ BUILD_DIR=${TMPDIR}/lbann-superbuild-dha # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} cmake \ -G Ninja \ -S ${SUPERBUILD_SRC_DIR} \ -B ${BUILD_DIR} \ \ - -D 
CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index a308c6e612b..7962478ea3e 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -52,14 +52,15 @@ BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} cmake \ -G Ninja \ -S ${SUPERBUILD_SRC_DIR} \ -B ${BUILD_DIR} \ \ - -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh index 3148232a0c6..65fc48412a3 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -53,14 +53,15 @@ BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-half # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} cmake \ -G Ninja \ -S ${SUPERBUILD_SRC_DIR} \ -B ${BUILD_DIR} \ \ - -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ \ From 4cb5ccd86797f998f7e87ced3e5100239737e216 Mon Sep 17 00:00:00 2001 From: "Brian 
C. Van Essen" Date: Mon, 17 Jun 2024 08:35:04 -0700 Subject: [PATCH 032/169] Updating the Tioga superbuild scripts to force the runpaths to be properly set. --- scripts/superbuild/ci/ci_pascal_env.sh | 44 +++++++++++++++++++ .../ci/ci_tioga_core_dependencies.sh | 2 + scripts/superbuild/ci/ci_tioga_dha.sh | 2 + scripts/superbuild/ci/ci_tioga_env.sh | 1 + 4 files changed, 49 insertions(+) create mode 100644 scripts/superbuild/ci/ci_pascal_env.sh diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh new file mode 100644 index 00000000000..5e107b064a8 --- /dev/null +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -0,0 +1,44 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Improve debugging info and remove some misguided warnings. These are +# passed only to the LBANN stack. 
+EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" + +# Prefer RPATH to RUNPATH (stability over flexibility) +EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" + +# Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) +CUDA_GPU_ARCH=60 + +# Set to the preferred install directory +INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 +INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/openmpi-4.1.2 + +# Location of external packages +export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 32ee028519e..cd1e095d1a3 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -66,6 +66,8 @@ cmake \ -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index c8c3b4c8490..92bc7f5b0e6 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -78,6 +78,8 @@ cmake \ -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index b3739abd998..45001c77040 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ 
b/scripts/superbuild/ci/ci_tioga_env.sh @@ -31,6 +31,7 @@ EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" +EXTRA_RPATHS=${CRAYLIBS_X86_64} # Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) AMD_GPU_ARCH=gfx90a,gfx942 From 82c0d763fee1b143b530793568e828650b837105 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 08:40:48 -0700 Subject: [PATCH 033/169] Updating the Pascal superbuild scripts to force the runpaths to be properly set. --- scripts/build_lbann.sh | 1 - scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 1 + scripts/superbuild/ci/ci_pascal_dha.sh | 1 + scripts/superbuild/ci/ci_pascal_dha_distconv.sh | 1 + scripts/superbuild/ci/ci_pascal_dha_half.sh | 1 + 5 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 2d7f83a2a43..d439dd3c1d6 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1092,7 +1092,6 @@ export LBANN_PYTHON_DIR=\$(dirname ${LBANN_PYTHON}) # Postpend the paths to the build tools to avoid putting system paths up front export PATH=\${PATH}:\${LBANN_CMAKE_DIR}:\${LBANN_NINJA_DIR}:\${LBANN_PYTHON_DIR} export PYTHONPATH=\${LBANN_PYTHONPATH}:\${PYTHONPATH} -export LD_LIBRARY_PATH=\$\{CRAY_LD_LIBRARY_PATH\}:\$\{LD_LIBRARY_PATH\} EOF if [[ -n "${MODULE_CMD}" ]]; then diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index 8fc46488c2c..d7bcbfd31f6 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -53,6 +53,7 @@ cmake \ -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ -D CMAKE_C_COMPILER=$(which gcc) \ -D CMAKE_CXX_COMPILER=$(which g++) \ diff --git 
a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh index 7862455bcd3..146ebff149f 100755 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -64,6 +64,7 @@ cmake \ -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ -D CMAKE_C_COMPILER=$(which gcc) \ -D CMAKE_CXX_COMPILER=$(which g++) \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index 7962478ea3e..ef54cc584ed 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -63,6 +63,7 @@ cmake \ -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ -D CMAKE_C_COMPILER=$(which gcc) \ -D CMAKE_CXX_COMPILER=$(which g++) \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh index 65fc48412a3..ea70ecdfaf7 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -64,6 +64,7 @@ cmake \ -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ -D CMAKE_C_COMPILER=$(which gcc) \ -D CMAKE_CXX_COMPILER=$(which g++) \ From dabeac1fd37474269184831670c508e5d4414f06 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 08:57:19 -0700 Subject: [PATCH 034/169] Added CMake flags to enable shared library builds. 
--- scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 2 ++ scripts/superbuild/ci/ci_pascal_dha.sh | 2 ++ scripts/superbuild/ci/ci_pascal_dha_distconv.sh | 2 ++ scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 2 ++ scripts/superbuild/ci/ci_tioga_dha.sh | 2 ++ 5 files changed, 10 insertions(+) diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index d7bcbfd31f6..3cff8c8b3cc 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -61,6 +61,8 @@ cmake \ -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ \ diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh index 146ebff149f..b38639d6090 100755 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -72,6 +72,8 @@ cmake \ -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index ef54cc584ed..53323aeb10e 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -71,6 +71,8 @@ cmake \ -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ \ diff --git 
a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index cd1e095d1a3..c6ac4886e97 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -73,6 +73,8 @@ cmake \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index 92bc7f5b0e6..92cc4ab263c 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -85,6 +85,8 @@ cmake \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ \ From 40f4b5c8886d4f74a34caccee98b700d3205aee3 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 09:24:45 -0700 Subject: [PATCH 035/169] Added a path to cuTensor for x86_64 platforms. 
--- scripts/superbuild/ci/ci_pascal_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index 5e107b064a8..b5b175882d1 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -40,5 +40,5 @@ INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/openmpi-4.1.2 # Location of external packages -export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4 +export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-2.0.1.2/libcutensor-linux-x86_64-2.0.1.2-archive CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} From 0a5982e5e21f6b57c8ce007a1556dadeac335807 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 13:33:23 -0700 Subject: [PATCH 036/169] Added a path to the correct miopen. --- scripts/superbuild_externals.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 3b140240c5b..75954cb6bd0 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -139,6 +139,13 @@ cat <> ${yaml} externals: - spec: aws-ofi-rccl@cxi arch=${spack_arch} prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/aws_ofi_rccl + miopen-hip: + buildable: false + version: + - '6.1.2' + externals: + - spec: miopen-hip@6.1.2 arch=${spack_arch} + prefix: /p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen EOF fi From 21fa5cf289ce296ce7f0c119a66bb5ead49a9bf2 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 13:49:27 -0700 Subject: [PATCH 037/169] Mark the new MIOpen as develop. 
--- scripts/customize_build_env.sh | 2 +- scripts/superbuild_externals.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 479e367dba3..ac729c6060b 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -222,7 +222,7 @@ set_center_specific_spack_dependencies() "zen3" | "zen4") # Tioga, RZVernal # CENTER_COMPILER="%cce@17.0.1" CENTER_COMPILER="%rocmcc@6.1.2" - CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.1.2 ^python@3.9.12 ^hwloc@3.0.0" + CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.1.2 ^python@3.9.12 ^hwloc@3.0.0 ^miopen-hip@develop" CENTER_BLAS_LIBRARY="blas=libsci" # Override the conduit variants for the cray compilers CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 75954cb6bd0..43adccbd0e3 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -142,9 +142,9 @@ cat <> ${yaml} miopen-hip: buildable: false version: - - '6.1.2' + - 'develop' externals: - - spec: miopen-hip@6.1.2 arch=${spack_arch} + - spec: miopen-hip@develop arch=${spack_arch} prefix: /p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen EOF fi From 360837c6676d8fc9cf5ece2666631ee0cbd70477 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Mon, 17 Jun 2024 14:08:22 -0700 Subject: [PATCH 038/169] Disable the superbuild on Corona and Lassen --- scripts/customize_build_env.sh | 10 +- .../superbuild/ci/ci_tioga_dha_distconv.sh | 125 ++++++++++++++++++ 2 files changed, 130 insertions(+), 5 deletions(-) create mode 100644 scripts/superbuild/ci/ci_tioga_dha_distconv.sh diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index ac729c6060b..876e6fb1062 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -344,9 +344,9 @@ cat <> ${yaml} - spec: rdma-core@20 arch=${spack_arch} prefix: /usr EOF - set_superbuild_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" "${dha_dir}" - set_superbuild_power_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" + # set_superbuild_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" "${prefix}" + # set_superbuild_DHA_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" "${dha_dir}" + # set_superbuild_power_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" ;; "zen" | "zen2") @@ -384,8 +384,8 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - set_superbuild_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" + # set_superbuild_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" + # set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "zen3" | "zen4") diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh new file mode 100644 index 00000000000..c2895a2e0eb --- /dev/null +++ 
b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -0,0 +1,125 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. +BUILD_WITH_DISTCONV=ON + +# Set to ON to enable Half support. Only matters if building the +# LBANN stack. +BUILD_WITH_HALF=OFF + +# Set to ON if you're on a Cray machine that doesn't provide the AWS +# plugin as part of its default RCCL installation. +# +# It might also be advisable to build this if you build a custom RCCL. +# The configuration script takes a RCCL path as a parameter, so it +# could matter, but it's not clear how much. 
+BUILD_AWS_OFI_RCCL_PLUGIN=ON + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha + +# Update the location of external packages +source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} + +export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ + \ + -D CMAKE_C_COMPILER=$(which amdclang) \ + -D CMAKE_CXX_COMPILER=$(which amdclang++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ + \ + -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D 
LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ + -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ + -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} From 16fd58544385c2660cc90c66ec15391baa42cfda Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 14:10:17 -0700 Subject: [PATCH 039/169] Fixed the install path. 
--- scripts/superbuild/ci/ci_tioga_dha_distconv.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 scripts/superbuild/ci/ci_tioga_dha_distconv.sh diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh old mode 100644 new mode 100755 index c2895a2e0eb..3e600664ad2 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -57,10 +57,10 @@ SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh # Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh From 8fc2e734e6640c4b18bfc5cb46fbb2112720956b Mon Sep 17 00:00:00 2001 From: Tom Benson Date: Mon, 17 Jun 2024 18:59:10 -0400 Subject: [PATCH 040/169] Add some logic to clean up the initial CMAKE_INSTALL_RPATH The path auto-generated by Spack may not be ideal. --- CMakeLists.txt | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d3f6a6d75a2..aab290c727c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -427,6 +427,22 @@ if (LBANN_HAS_ROCM) endif () message(STATUS "Using LBANN_ROCM_PATH: ${LBANN_ROCM_PATH}") + if (CMAKE_INSTALL_RPATH) + # The first of these actually matters; the last 4 are just good + # measure. 
+ list(REMOVE_ITEM CMAKE_INSTALL_RPATH + "${LBANN_ROCM_PATH}/lib" + "/usr/lib64" + "/usr/lib" + "/usr/local/lib64" + "/usr/local/lib" + ) + # Write the value out to the cache + set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" + CACHE STRING "The install rpath to use" + FORCE) + endif () + find_package(hip CONFIG REQUIRED) enable_language(HIP) find_package(MIOpen CONFIG REQUIRED) @@ -1188,7 +1204,3 @@ install(CODE [===[ message(" module load lbann") message("********************************************************************************") ]===]) - - - - From 07a5de7abaaea0ee5b211be795d514d75644e0a7 Mon Sep 17 00:00:00 2001 From: Tom Benson Date: Mon, 17 Jun 2024 19:24:31 -0400 Subject: [PATCH 041/169] Remove system paths from build rpath --- CMakeLists.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index aab290c727c..f2aba86e3f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -427,6 +427,24 @@ if (LBANN_HAS_ROCM) endif () message(STATUS "Using LBANN_ROCM_PATH: ${LBANN_ROCM_PATH}") + # The Catch2 tests are only ever build artifacts, so this needs to + # change too. + if (CMAKE_BUILD_RPATH) + # The first of these actually matters; the last 4 are just good + # measure. + list(REMOVE_ITEM CMAKE_BUILD_RPATH + "${LBANN_ROCM_PATH}/lib" + "/usr/lib64" + "/usr/lib" + "/usr/local/lib64" + "/usr/local/lib" + ) + # Write the value out to the cache + set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" + CACHE STRING "The build rpath to use" + FORCE) + endif () + if (CMAKE_INSTALL_RPATH) # The first of these actually matters; the last 4 are just good # measure. From 4b32fd66f23932e3820c6e50664139647f45c5c2 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 17 Jun 2024 23:32:58 -0700 Subject: [PATCH 042/169] Temporarily disable Corona and Lassen tests. 
--- .gitlab-ci.yml | 82 +++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 965bf6e969c..e03e013ca29 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -36,50 +36,50 @@ variables: stages: - run-all-clusters -corona testing: - stage: run-all-clusters - variables: - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/corona/pipeline.yml +# corona testing: +# stage: run-all-clusters +# variables: +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# trigger: +# strategy: depend +# include: .gitlab/corona/pipeline.yml -corona distconv testing: - stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" - SPACK_SPECS: "+rocm +distconv" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" - trigger: - strategy: depend - include: .gitlab/corona/pipeline.yml +# corona distconv testing: +# stage: run-all-clusters +# variables: +# JOB_NAME_SUFFIX: _distconv +# SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" +# SPACK_SPECS: "+rocm +distconv" +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# TEST_FLAG: "test_*_distconv.py" +# trigger: +# strategy: depend +# include: .gitlab/corona/pipeline.yml -lassen testing: - stage: run-all-clusters - variables: - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/lassen/pipeline.yml +# lassen testing: +# stage: run-all-clusters +# variables: +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# trigger: +# strategy: depend +# include: .gitlab/lassen/pipeline.yml -lassen distconv testing: - stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - 
SPACK_ENV_BASE_NAME_MODIFIER: "-multi-stage-distconv" - SPACK_SPECS: "+cuda +distconv +fft" -# SPACK_SPECS: "+cuda +distconv +nvshmem +fft" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" - trigger: - strategy: depend - include: .gitlab/lassen/multi_stage_pipeline.yml +# lassen distconv testing: +# stage: run-all-clusters +# variables: +# JOB_NAME_SUFFIX: _distconv +# SPACK_ENV_BASE_NAME_MODIFIER: "-multi-stage-distconv" +# SPACK_SPECS: "+cuda +distconv +fft" +# # SPACK_SPECS: "+cuda +distconv +nvshmem +fft" +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# TEST_FLAG: "test_*_distconv.py" +# trigger: +# strategy: depend +# include: .gitlab/lassen/multi_stage_pipeline.yml pascal testing: stage: run-all-clusters From 4034c78f8f2a299fa33b3e1b6acf9d6e9c28dd05 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 19 Jun 2024 01:42:51 -0700 Subject: [PATCH 043/169] Fixed how the CMake environment sets up the PYTHONPATH and caches it in the lbann_pfe.sh and module files. Added hints to the superbuild of where to install necessary Python packages. 
--- CMakeLists.txt | 2 +- cmake/configure_files/lbann_module.lua.in | 1 + scripts/superbuild/CMakeLists.txt | 12 ++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f2aba86e3f6..11f7f06fb23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -588,7 +588,7 @@ endif (LBANN_WITH_PYTHON_FRONTEND OR LBANN_WITH_EMBEDDED_PYTHON) if (LBANN_WITH_PYTHON_FRONTEND) set(LBANN_PFE_PYTHON_EXECUTABLE "${Python_EXECUTABLE}" CACHE FILEPATH "") - set(LBANN_PFE_PYTHONPATH "" CACHE STRING "") + set(LBANN_PFE_PYTHONPATH "${PYTHON_INSTALL_PREFIX}/${CMAKE_INSTALL_PYTHONDIR}" CACHE STRING "") configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_pfe.sh.in" "${CMAKE_BINARY_DIR}/lbann_pfe.sh" diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 2d886995882..88a9846aa81 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -88,6 +88,7 @@ whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") +prepend_path("LBANN_PYTHON_SITE_PACKAGES","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") pushenv("LBANN_DIR","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@") diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index f2ecb515af0..9507e5a629a 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -154,6 +154,18 @@ if (LBANN_SB_BUILD_AWS_OFI_RCCL) message("-----------------------------------------------------------------\n") endif () +if (LBANN_SB_FWD_LBANN_LBANN_WITH_PYTHON_FRONTEND) + message("-----------------------------------------------------------------\n") + message("LBANN was built with support for the Python Front End (PFE) (If you need to 
install it via pip you can in the LBANN site-packages with):") + message(" python3 -m pip install --target \$\{LBANN_PYTHON_SITE_PACKAGES\} pytest") + message(" python3 -m pip install --target \$\{LBANN_PYTHON_SITE_PACKAGES\} protobuf") + if (LBANN_SB_FWD_LBANN_LBANN_WITH_CNPY) + message("\nLBANN was built with support for the NumPy (If you need to install it via pip you can in the LBANN site-packages with):") + message(" python3 -m pip install --target \$\{LBANN_PYTHON_SITE_PACKAGES\} numpy") + endif () + message("\n-----------------------------------------------------------------\n") +endif () + # Add a custom target for bundling all things up if (UNIX) find_program(__FIND_EXE find) From ca797dbd0090c418203a7d18a98145590e34a273 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 19 Jun 2024 03:03:49 -0700 Subject: [PATCH 044/169] Revert back to ROCm 5.7.1 --- scripts/customize_build_env.sh | 43 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 876e6fb1062..f0b82dd11f6 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -127,7 +127,8 @@ set_center_specific_modules() # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3" | "zen4") # Tioga, RZVernal, RZAdams - MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.1.2 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.1.2 cmake/3.24.2" + MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/5.7.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/5.7.1 cmake/3.24.2" +# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.1.2 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.1.2 cmake/3.24.2" # 
MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray StdEnv rocm/6.0.3 cmake/3.24.2" ;; *) @@ -221,8 +222,8 @@ set_center_specific_spack_dependencies() ;; "zen3" | "zen4") # Tioga, RZVernal # CENTER_COMPILER="%cce@17.0.1" - CENTER_COMPILER="%rocmcc@6.1.2" - CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@6.1.2 ^python@3.9.12 ^hwloc@3.0.0 ^miopen-hip@develop" + CENTER_COMPILER="%rocmcc@5.7.1" + CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@5.7.1 ^python@3.9.12 ^hwloc@3.0.0 ^miopen-hip@develop" CENTER_BLAS_LIBRARY="blas=libsci" # Override the conduit variants for the cray compilers CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" @@ -398,7 +399,7 @@ EOF cat <> ${yaml} compilers: - compiler: - spec: rocmcc@6.1.2 + spec: rocmcc@5.7.1 paths: cc: amdclang cxx: amdclang++ @@ -411,8 +412,8 @@ cat <> ${yaml} target: any modules: - PrgEnv-amd - - amd/6.1.2 - - rocm/6.1.2 + - amd/5.7.1 + - rocm/5.7.1 environment: {} # extra_rpaths: # - /opt/cray/pe/cce/17.0.1/cce/x86_64/lib @@ -430,7 +431,7 @@ cat <> ${yaml} modules: - PrgEnv-cray - cce/17.0.1 - - rocm/6.1.2 + - rocm/5.7.1 environment: {} extra_rpaths: - /opt/cray/pe/cce/17.0.1/cce/x86_64/lib @@ -444,24 +445,24 @@ cat <> ${yaml} hipcub: buildable: false version: - - '6.1.2' + - '5.7.1' externals: - - spec: hipcub@6.1.2 arch=${spack_arch} - prefix: /opt/rocm-6.1.2/hipcub + - spec: hipcub@5.7.1 arch=${spack_arch} + prefix: /opt/rocm-5.7.1/hipcub rocthrust: buildable: false version: - - '6.1.2' + - '5.7.1' externals: - - spec: rocthrust@6.1.2 arch=${spack_arch} - prefix: /opt/rocm-6.1.2 + - spec: rocthrust@5.7.1 arch=${spack_arch} + prefix: /opt/rocm-5.7.1 llvm-amdgpu: buildable: false version: - - '6.1.2' + - '5.7.1' externals: - - spec: llvm-amdgpu@6.1.2 arch=${spack_arch} - prefix: /opt/rocm-6.1.2/llvm + - spec: llvm-amdgpu@5.7.1 arch=${spack_arch} + prefix: /opt/rocm-5.7.1/llvm cray-libsci: buildable: false 
version: @@ -469,8 +470,8 @@ cat <> ${yaml} externals: - spec: cray-libsci@24.03.0 %rocmcc arch=${spack_arch} modules: -# - amd/6.1.2 PrgEnv-amd cray-libsci/24.03.0 -# - amd/6.1.2 PrgEnv-amd cray-libsci/23.09.1.1 +# - amd/5.7.1 PrgEnv-amd cray-libsci/24.03.0 +# - amd/5.7.1 PrgEnv-amd cray-libsci/23.09.1.1 - cce/17.0.1 PrgEnv-cray cray-libsci/24.03.0 cray-mpich: buildable: false @@ -479,11 +480,11 @@ cat <> ${yaml} externals: - spec: cray-mpich@8.1.29 %rocmcc arch=${spack_arch} modules: - - amd/6.1.2 PrgEnv-amd cray-mpich/8.1.29 + - amd/5.7.1 PrgEnv-amd cray-mpich/8.1.29 # - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF - set_superbuild_externals ${host} "rocm-6.1.2" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-6.1.2" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; From 69113df517c0e8cd9b267838c2650918db7ee4f2 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 19 Jun 2024 03:04:47 -0700 Subject: [PATCH 045/169] Updated the superbuild scripts to use LLD and Gold linkers as appropriate. Made the Tioga superbuild scripts easier to change to new ROCm versions.
--- scripts/superbuild/ci/ci_pascal_env.sh | 2 +- .../ci/ci_tioga_core_dependencies.sh | 2 +- scripts/superbuild/ci/ci_tioga_dha.sh | 4 +- .../superbuild/ci/ci_tioga_dha_distconv.sh | 2 +- scripts/superbuild/ci/ci_tioga_env.sh | 11 +- scripts/superbuild/ci/ci_tioga_lbann.sh | 152 ++++++++++++++++++ 6 files changed, 165 insertions(+), 8 deletions(-) create mode 100755 scripts/superbuild/ci/ci_tioga_lbann.sh diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index b5b175882d1..8ce5a9b2e60 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -30,7 +30,7 @@ EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" +EXTRA_LINK_FLAGS="-fuse-ld=gold -Wl,--disable-new-dtags" # Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) CUDA_GPU_ARCH=60 diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index c6ac4886e97..36e3f861b57 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -56,7 +56,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies +BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies-${ROCM_VER} cmake \ -G Ninja \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index 92cc4ab263c..775e7b86547 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -60,11 +60,11 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha 
+BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-${ROCM_VER} # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index 3e600664ad2..088d91b4d2b 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -60,7 +60,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv-${ROCM_VER} # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 45001c77040..82e8299115b 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -30,7 +30,7 @@ EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS="-Wl,--disable-new-dtags" +EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" EXTRA_RPATHS=${CRAYLIBS_X86_64} # Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) @@ -39,12 +39,17 @@ AMD_GPU_ARCH=gfx90a,gfx942 # Set to the preferred install directory #INSTALL_PREFIX=${PWD}/install-rocm-distconv -INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/rocm-6.1.2 +ROCM_VER=$(basename ${ROCM_PATH}) 
+INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${ROCM_VER} INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-8.1.29/ # Location of external packages -export CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen +#export CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +# Patch up the header while dealing with MiOpen +#export CPLUS_INCLUDE_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen/include:${CPLUS_INCLUDE_PATH} + +#CPLUS_INCLUDE_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen/include:${CPLUS_INCLUDE_PATH} export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} diff --git a/scripts/superbuild/ci/ci_tioga_lbann.sh b/scripts/superbuild/ci/ci_tioga_lbann.sh new file mode 100755 index 00000000000..f57f1fcb738 --- /dev/null +++ b/scripts/superbuild/ci/ci_tioga_lbann.sh @@ -0,0 +1,152 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. 
See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +################################################################ +# Parse command-line arguments +################################################################ + +# while :; do +# case ${1} in +# -h|--help) +# # Help message +# help_message +# exit 1 +# ;; +# --install_prefix) +# if [ -n "${2}" ]; then +# INSTALL_PREFIX=${2} +# shift +# else +# echo "\"${1}\" option requires a non-empty option argument" >&2 +# exit 1 +# fi +# ;; +# *) +# # Break loop if there are no more options +# break +# esac +# shift +# done + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. +BUILD_WITH_DISTCONV=OFF + +# Set to ON to enable Half support. Only matters if building the +# LBANN stack. +BUILD_WITH_HALF=OFF + +# Set to ON if you're on a Cray machine that doesn't provide the AWS +# plugin as part of its default RCCL installation. +# +# It might also be advisable to build this if you build a custom RCCL. +# The configuration script takes a RCCL path as a parameter, so it +# could matter, but it's not clear how much. 
+BUILD_AWS_OFI_RCCL_PLUGIN=ON + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh + +# Set to the preferred install directory +#INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha + +# Set to the preferred build directory +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-lbann-${ROCM_VER} + +# Set to the preferred install directory +INSTALL_PREFIX=${LBANN_SRC_DIR}/superbuild/install_${ROCM_VER} + +# Update the location of external packages +#source ${INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh +source ${INSTALL_PREFIX_EXTERNALS}/dha/logs/lbann_sb_suggested_cmake_prefix_path.sh +#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} + +#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ + \ + -D CMAKE_C_COMPILER=$(which amdclang) \ + -D CMAKE_CXX_COMPILER=$(which amdclang++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ + \ + -D 
LBANN_SB_BUILD_LBANN=${BUILD_LBANN_STACK} \ + -D LBANN_SB_LBANN_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ + -D LBANN_SB_FWD_LBANN_CMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -D LBANN_SB_LBANN_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_LBANN_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_LBANN_LBANN_DATATYPE=float \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_CALIPER=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_TBINF=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_UNIT_TESTING=ON \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_CNPY=ON \ + -D LBANN_SB_FWD_LBANN_LBANN_DETERMINISTIC=ON \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_ADDRESS_SANITIZER=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_NVSHMEM=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_FFT=OFF \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_EMBEDDED_PYTHON=ON \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_PYTHON_FRONTEND=ON \ + -D LBANN_SB_FWD_LBANN_LBANN_WITH_VISION=ON \ + -D LBANN_SB_FWD_LBANN_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} From a022170acb1af20b39f3f1908cf17e7f452d2541 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 19 Jun 2024 04:52:51 -0700 Subject: [PATCH 046/169] Removing custom MIOpen build. 
--- scripts/customize_build_env.sh | 2 +- scripts/superbuild_externals.sh | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index f0b82dd11f6..c587c5f6724 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -223,7 +223,7 @@ set_center_specific_spack_dependencies() "zen3" | "zen4") # Tioga, RZVernal # CENTER_COMPILER="%cce@17.0.1" CENTER_COMPILER="%rocmcc@5.7.1" - CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@5.7.1 ^python@3.9.12 ^hwloc@3.0.0 ^miopen-hip@develop" + CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@5.7.1 ^python@3.9.12 ^hwloc@3.0.0" CENTER_BLAS_LIBRARY="blas=libsci" # Override the conduit variants for the cray compilers CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 43adccbd0e3..3b140240c5b 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -139,13 +139,6 @@ cat <> ${yaml} externals: - spec: aws-ofi-rccl@cxi arch=${spack_arch} prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/aws_ofi_rccl - miopen-hip: - buildable: false - version: - - 'develop' - externals: - - spec: miopen-hip@develop arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen EOF fi From e2e8a172f28b8c804bf815aee622bbfc1772a763 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 19 Jun 2024 15:24:21 -0700 Subject: [PATCH 047/169] Added the build modules to the LBANN_DEPENDENT_MODULES so that they are loaded at runtime since the RPATH and RUNPATH aren't capturing certain Cray packages.
--- .gitlab/tioga/pipeline.yml | 1 + scripts/build_lbann.sh | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 49c271a86fb..9bfbc2f0633 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -80,6 +80,7 @@ build and install: --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib + # lbann_pfe.sh -m ensurepip --upgrade - export TEST_TASKS_PER_NODE=4 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux run -N 1 spack arch) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index d439dd3c1d6..7a4622fc60b 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1094,7 +1094,12 @@ export PATH=\${PATH}:\${LBANN_CMAKE_DIR}:\${LBANN_NINJA_DIR}:\${LBANN_PYTHON_DIR export PYTHONPATH=\${LBANN_PYTHONPATH}:\${PYTHONPATH} EOF +BUILD_MODULES= if [[ -n "${MODULE_CMD}" ]]; then + BUILD_MODULES=${MODULE_CMD//module load /} +# echo "BVE here is ${BUILD_MODULES}" + BUILD_MODULES=${BUILD_MODULES// /;} +# echo "2nd time BVE here is ${BUILD_MODULES}" cat >> ${LBANN_SETUP_FILE}< Date: Wed, 19 Jun 2024 16:03:22 -0700 Subject: [PATCH 048/169] Fixed how the LBANN_DEPENDENT_MODULES are composed. 
--- scripts/build_lbann.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 7a4622fc60b..63867aada05 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1110,7 +1110,7 @@ EOF # Build a list of modules that LBANN should load LBANN_WRITE_DEPENDENT_MODULEPATH="${LBANN_MODFILES_DIR}/Core" LBANN_DEPENDENT_MODULES=$(spack-python $SCRIPTS_DIR/find_externals_and_lbann_top_level_dependencies.py) - LBANN_DEPENDENT_MODULES=${BUILD_MODULES};${LBANN_DEPENDENT_MODULES} + LBANN_DEPENDENT_MODULES="${BUILD_MODULES};${LBANN_DEPENDENT_MODULES}" if [[ "${CENTER_COMPILER}" =~ .*"%clang".* ]]; then # If the compiler is clang use the LLD fast linker CENTER_LINKER_FLAGS="+lld" From 545ef71a98db6bea7bb77f8331ab60b58ec786c7 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 20 Jun 2024 00:45:29 -0700 Subject: [PATCH 049/169] Temporarily reduce the time for Tioga jobs --- .gitlab/tioga/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 9bfbc2f0633..698f44e4f8a 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -52,7 +52,7 @@ allocate lc resources: script: - echo "== ACQUIRING FLUX RESOURCES ==" - echo "${WITH_WEEKLY:+Running with --weekly}" - - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "120m") + - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "60m") - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") - export FLUX_F58_FORCE_ASCII=t - export JOB_ID="UNALLOCATED" From 161e799512f7b9cb5839372d34f1c6a9ce38825a Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 20 Jun 2024 01:42:06 -0700 Subject: [PATCH 050/169] Try a different set of modules for Tioga. 
--- scripts/customize_build_env.sh | 4 +++- scripts/superbuild/ci/ci_tioga_dha_distconv.sh | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index c587c5f6724..cc7677b8672 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -127,7 +127,9 @@ set_center_specific_modules() # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3" | "zen4") # Tioga, RZVernal, RZAdams - MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/5.7.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/5.7.1 cmake/3.24.2" + MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/24.03.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray/8.5.0 flux_wrappers/0.1 StdEnv cmake/3.24.2 rocm/5.7.1" + +# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/5.7.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/5.7.1 cmake/3.24.2" # MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.1.2 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.1.2 cmake/3.24.2" # MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray StdEnv rocm/6.0.3 cmake/3.24.2" ;; diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index 088d91b4d2b..b0e94b1308e 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -64,7 +64,7 @@ BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv-${ROCM_VER} # Update the location of external packages source 
${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} From 2617c30a5cde45559d6ecbb682701f1b9d9d27d4 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 20 Jun 2024 02:17:11 -0700 Subject: [PATCH 051/169] Trimming time. --- .gitlab/tioga/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 698f44e4f8a..80e218671fe 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -52,7 +52,7 @@ allocate lc resources: script: - echo "== ACQUIRING FLUX RESOURCES ==" - echo "${WITH_WEEKLY:+Running with --weekly}" - - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "60m") + - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "40m") - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") - export FLUX_F58_FORCE_ASCII=t - export JOB_ID="UNALLOCATED" From 91997e758a59642e20c9b8a890f640a346c826dc Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 24 Jun 2024 16:57:55 -0700 Subject: [PATCH 052/169] Fixed grouping on link flags. Fixed RPATH issues for build and install objects.
--- scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 5 +++-- scripts/superbuild/ci/ci_tioga_dha.sh | 5 +++-- scripts/superbuild/ci/ci_tioga_dha_distconv.sh | 5 +++-- scripts/superbuild/ci/ci_tioga_env.sh | 4 ++-- scripts/superbuild/ci/ci_tioga_lbann.sh | 9 +++++++-- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 36e3f861b57..192a2dd4c53 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -68,6 +68,7 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ @@ -75,8 +76,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_HIP_STANDARD=17 \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index 775e7b86547..e25febe1f5d 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -80,6 +80,7 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ @@ -87,8 +88,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D 
CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_HIP_STANDARD=17 \ diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index b0e94b1308e..5db4b4fdb5b 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -80,6 +80,7 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ @@ -87,8 +88,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_HIP_STANDARD=17 \ diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 82e8299115b..cdc0cd2a340 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -26,8 +26,8 @@ # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. 
-EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-O2 -g -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-O2 -g -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" diff --git a/scripts/superbuild/ci/ci_tioga_lbann.sh b/scripts/superbuild/ci/ci_tioga_lbann.sh index f57f1fcb738..b6fe55d6116 100755 --- a/scripts/superbuild/ci/ci_tioga_lbann.sh +++ b/scripts/superbuild/ci/ci_tioga_lbann.sh @@ -100,6 +100,10 @@ CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} #export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} +# -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ +#-DCMAKE_BUILD_RPATH=${EXTRA_RPATHS} +#-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON +# -D CMAKE_BUILD_WITH_INSTALL_RPATH=ON \ cmake \ -G Ninja \ @@ -111,6 +115,7 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ @@ -118,8 +123,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_HIP_STANDARD=17 \ From a91fea621352c6a0c3c6f15b0bb06d0bfd321698 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 11:43:36 -0700 Subject: [PATCH 053/169] Increasing the precision of the reported error for check metric. 
--- src/callbacks/check_metric.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/callbacks/check_metric.cpp b/src/callbacks/check_metric.cpp index 611e2b16bfc..b3e8f8f79cd 100644 --- a/src/callbacks/check_metric.cpp +++ b/src/callbacks/check_metric.cpp @@ -37,6 +37,7 @@ #include "lbann/proto/callbacks.pb.h" +#include #include #include #include @@ -59,11 +60,19 @@ check_metric::check_metric(std::string metric_name, if (lower_bound > upper_bound) { std::stringstream err; err << "callback \"" << name() << "\" " - << "got an invalid range for metric values " + << "got an invalid range for metric values " << std::setprecision(9) << "(lower bound " << m_lower_bound << ", " << "upper bound " << m_upper_bound << ")"; LBANN_ERROR(err.str()); } + if (lower_bound == upper_bound) { + std::stringstream err; + err << "callback \"" << name() << "\" " + << "got an zero range for metric values " << std::setprecision(9) + << "(lower bound " << m_lower_bound << " == " + << "upper bound " << m_upper_bound << ")"; + LBANN_WARNING(err.str()); + } } check_metric::check_metric() : check_metric("", {}, 0, 0, false) {} @@ -122,8 +131,8 @@ void check_metric::do_check_metric(const model& m) const if (!(m_lower_bound <= value && value <= m_upper_bound)) { err << "callback \"" << name() << "\" expected " << "metric \"" << m_metric_name << "\" " - << "to have a value in range " - << "[" << m_lower_bound << "," << m_upper_bound << "], " + << "to have a value in range " << std::setprecision(9) << "[" + << m_lower_bound << "," << m_upper_bound << "], " << "but found a value of " << value; if (m_error_on_failure) { LBANN_ERROR(err.str()); From 22108ac5ab0b99727c46af21521f2cd8250a3384 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 11:44:07 -0700 Subject: [PATCH 054/169] Force the installation of pip packages in the installed location to avoid bad system install. 
--- scripts/build_lbann.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 63867aada05..426fcb8c5a7 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1278,7 +1278,7 @@ echo "To manipulate the version of python used it is:" | tee -a ${LOG} echo " ${LBANN_PYTHON}" | tee -a ${LOG} echo "Additional Python packages for working with LBANN can be added either via PIP or by concretizing them together in spack., activate the spack environment then" | tee -a ${LOG} echo "To install them via PIP: 1) the spack environment (see above) and 2) issue the following command" | tee -a ${LOG} -echo " python3 -m pip install -r " | tee -a ${LOG} +echo " python3 -m pip install --force-reinstall -r " | tee -a ${LOG} echo "To install them via Spack: include them on the build_lbann.sh script command line argument via -e or -p " | tee -a ${LOG} echo "##########################################################################################" | tee -a ${LOG} echo "All details of the run are logged to ${LOG}" From 7565e809faf51fc8b3b7c972e5630693eed7a539 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 11:45:08 -0700 Subject: [PATCH 055/169] Fixed the time. 
--- .gitlab/tioga/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 80e218671fe..9ab80136b61 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -52,7 +52,7 @@ allocate lc resources: script: - echo "== ACQUIRING FLUX RESOURCES ==" - echo "${WITH_WEEKLY:+Running with --weekly}" - - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "40m") + - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "90m") - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") - export FLUX_F58_FORCE_ASCII=t - export JOB_ID="UNALLOCATED" From a8e7c0caf5f0e13cfed7ed0e1cbc84c7983afd90 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 15:28:19 -0700 Subject: [PATCH 056/169] Correctly set the --force-reinstall flag on the pip command. --- scripts/build_lbann.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 426fcb8c5a7..30f3af8f18a 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1044,9 +1044,9 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then for p in ${PIP_EXTRAS} do if [[ -e "${p}" ]]; then - CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} -r ${p}" + CMD="python3 -m pip install --force-reinstall -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} -r ${p}" else - CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} ${p}" + CMD="python3 -m pip install --force-reinstall -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} ${p}" fi echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } @@ -1278,7 +1278,7 @@ echo "To manipulate the version of python used it is:" | tee -a ${LOG} echo " ${LBANN_PYTHON}" | tee -a ${LOG} echo "Additional Python packages for working with LBANN can be added either via PIP 
or by concretizing them together in spack., activate the spack environment then" | tee -a ${LOG} echo "To install them via PIP: 1) the spack environment (see above) and 2) issue the following command" | tee -a ${LOG} -echo " python3 -m pip install --force-reinstall -r " | tee -a ${LOG} +echo " python3 -m pip install -r " | tee -a ${LOG} echo "To install them via Spack: include them on the build_lbann.sh script command line argument via -e or -p " | tee -a ${LOG} echo "##########################################################################################" | tee -a ${LOG} echo "All details of the run are logged to ${LOG}" From a21824328140a8659d4c798bd2ec58480b859cf2 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 16:31:38 -0700 Subject: [PATCH 057/169] Correcting the nightly time limit. --- .gitlab/tioga/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 9ab80136b61..9bfbc2f0633 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -52,7 +52,7 @@ allocate lc resources: script: - echo "== ACQUIRING FLUX RESOURCES ==" - echo "${WITH_WEEKLY:+Running with --weekly}" - - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "90m") + - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "150m" || echo "120m") - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") - export FLUX_F58_FORCE_ASCII=t - export JOB_ID="UNALLOCATED" From 7ed3b5e3bc61cb394770030d12aa1ca74203e23b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 16:33:48 -0700 Subject: [PATCH 058/169] Set the CXX and CUDA flags to an optimized build. 
--- scripts/superbuild/ci/ci_pascal_env.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index 8ce5a9b2e60..934c5eb18e0 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -26,8 +26,10 @@ # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-O2 -g -Wno-deprecated-declarations" +#EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +#EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CUDA_FLAGS="-O2 -g -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=gold -Wl,--disable-new-dtags" From 6919899bcc8ba369e04e143e9e00e0ed333e349b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 16:34:31 -0700 Subject: [PATCH 059/169] Updated the Tioga builds to include the PE_ENV field in the stable dependencies pathname. 
--- scripts/customize_build_env.sh | 9 +++++---- scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 2 +- scripts/superbuild/ci/ci_tioga_dha.sh | 2 +- scripts/superbuild/ci/ci_tioga_dha_distconv.sh | 2 +- scripts/superbuild/ci/ci_tioga_env.sh | 3 ++- scripts/superbuild/ci/ci_tioga_lbann.sh | 2 +- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index cc7677b8672..b8bbcccb228 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -127,9 +127,9 @@ set_center_specific_modules() # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3" | "zen4") # Tioga, RZVernal, RZAdams - MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/24.03.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray/8.5.0 flux_wrappers/0.1 StdEnv cmake/3.24.2 rocm/5.7.1" +# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/24.03.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray/8.5.0 flux_wrappers/0.1 StdEnv cmake/3.24.2 rocm/5.7.1" -# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/5.7.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/5.7.1 cmake/3.24.2" + MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv amd/5.7.1 rocm/5.7.1 cmake/3.24.2" # MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.1.2 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.1.2 cmake/3.24.2" # MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray StdEnv rocm/6.0.3 
cmake/3.24.2" ;; @@ -485,8 +485,9 @@ cat <> ${yaml} - amd/5.7.1 PrgEnv-amd cray-mpich/8.1.29 # - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF - set_superbuild_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" + PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') + set_superbuild_externals ${host} "${PE_ENV_lc}/rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "${PE_ENV_lc}/rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 192a2dd4c53..81638f8760a 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -56,7 +56,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies-${ROCM_VER} +BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies-${PE_ENV_lc}-${ROCM_VER} cmake \ -G Ninja \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index e25febe1f5d..f33dc716d4e 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -60,7 +60,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-${ROCM_VER} +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-${PE_ENV_lc}-${ROCM_VER} # Update the location of external packages source 
${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index 5db4b4fdb5b..4ea2c791ad1 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -60,7 +60,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv-${ROCM_VER} +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv-${PE_ENV_lc}-${ROCM_VER} # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index cdc0cd2a340..d356d1919f7 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -40,7 +40,8 @@ AMD_GPU_ARCH=gfx90a,gfx942 # Set to the preferred install directory #INSTALL_PREFIX=${PWD}/install-rocm-distconv ROCM_VER=$(basename ${ROCM_PATH}) -INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${ROCM_VER} +PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') +INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${PE_ENV_lc}//${ROCM_VER} INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-8.1.29/ # Location of external packages diff --git a/scripts/superbuild/ci/ci_tioga_lbann.sh b/scripts/superbuild/ci/ci_tioga_lbann.sh index b6fe55d6116..c1cef0fcf37 100755 --- a/scripts/superbuild/ci/ci_tioga_lbann.sh +++ b/scripts/superbuild/ci/ci_tioga_lbann.sh @@ -87,7 +87,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh #INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-lbann-${ROCM_VER} +BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-lbann-${PE_ENV_lc}-${ROCM_VER} # Set to the preferred install 
directory INSTALL_PREFIX=${LBANN_SRC_DIR}/superbuild/install_${ROCM_VER} From 1ca419923e37552d621b6e655d559dd1b9a23fc8 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 18:39:06 -0700 Subject: [PATCH 060/169] Updated the build path so that the source files can be saved for debugging. --- scripts/superbuild/ci/ci_pascal_env.sh | 4 ++-- scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 2 +- scripts/superbuild/ci/ci_tioga_dha.sh | 2 +- scripts/superbuild/ci/ci_tioga_dha_distconv.sh | 2 +- scripts/superbuild/ci/ci_tioga_env.sh | 9 ++++++--- scripts/superbuild/ci/ci_tioga_lbann.sh | 2 +- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index 934c5eb18e0..54b7a0537a5 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -26,10 +26,10 @@ # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. 
-EXTRA_CXX_FLAGS="-O2 -g -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-O2 -g3 -Wno-deprecated-declarations" #EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" #EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_CUDA_FLAGS="-O2 -g -Wno-deprecated-declarations" +EXTRA_CUDA_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=gold -Wl,--disable-new-dtags" diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 81638f8760a..2aab738031b 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -56,7 +56,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies-${PE_ENV_lc}-${ROCM_VER} +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies-${PE_ENV_lc}-${ROCM_VER} cmake \ -G Ninja \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index f33dc716d4e..7d93c052ab6 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -60,7 +60,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-${PE_ENV_lc}-${ROCM_VER} +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-${PE_ENV_lc}-${ROCM_VER} # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index 4ea2c791ad1..842ae95a23a 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -60,7 +60,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh 
INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv-${PE_ENV_lc}-${ROCM_VER} +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv-${PE_ENV_lc}-${ROCM_VER} # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index d356d1919f7..46e9835cdb9 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -26,8 +26,8 @@ # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-O2 -g -Wno-deprecated-declarations" -EXTRA_HIP_FLAGS="-O2 -g -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-O2 -g3 -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" @@ -41,9 +41,12 @@ AMD_GPU_ARCH=gfx90a,gfx942 #INSTALL_PREFIX=${PWD}/install-rocm-distconv ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') -INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${PE_ENV_lc}//${ROCM_VER} +INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${PE_ENV_lc}/${ROCM_VER} INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-8.1.29/ +# Use an accessible build directory so that the source files are preserved for debuggin +BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/tioga/${PE_ENV_lc}/${ROCM_VER} + # Location of external packages #export CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} diff --git a/scripts/superbuild/ci/ci_tioga_lbann.sh b/scripts/superbuild/ci/ci_tioga_lbann.sh index c1cef0fcf37..6b8b9e1bfcc 100755 --- 
a/scripts/superbuild/ci/ci_tioga_lbann.sh +++ b/scripts/superbuild/ci/ci_tioga_lbann.sh @@ -87,7 +87,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh #INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-lbann-${PE_ENV_lc}-${ROCM_VER} +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-lbann-${PE_ENV_lc}-${ROCM_VER} # Set to the preferred install directory INSTALL_PREFIX=${LBANN_SRC_DIR}/superbuild/install_${ROCM_VER} From 4a3501b317103747583748b4cc4d0324e2256536 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 18:41:39 -0700 Subject: [PATCH 061/169] Updated the build path so that the source files can be saved for debugging on pascal. --- scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 2 +- scripts/superbuild/ci/ci_pascal_dha.sh | 2 +- scripts/superbuild/ci/ci_pascal_dha_distconv.sh | 2 +- scripts/superbuild/ci/ci_pascal_dha_half.sh | 2 +- scripts/superbuild/ci/ci_pascal_env.sh | 3 +++ 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index 3cff8c8b3cc..9c4fce61b4f 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -41,7 +41,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-core-dependencies +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies #export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh index b38639d6090..b14fb10913d 100755 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -48,7 +48,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha # 
Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index 53323aeb10e..e9dc7dee8c5 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -48,7 +48,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-distconv +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh index ea70ecdfaf7..be5dc6013ec 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -48,7 +48,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_half # Set to the preferred build directory -BUILD_DIR=${TMPDIR}/lbann-superbuild-dha-half +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-half # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index 54b7a0537a5..1dd7c48d67a 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -41,6 +41,9 @@ CUDA_GPU_ARCH=60 INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/openmpi-4.1.2 +# Use an accessible build directory so that the source files are preserved for 
debuggin +BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/tioga/${PE_ENV_lc}/${ROCM_VER} + # Location of external packages export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-2.0.1.2/libcutensor-linux-x86_64-2.0.1.2-archive CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} From ad64cb90dc98022b2795e0b398e79283c33ebd7a Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 18:43:26 -0700 Subject: [PATCH 062/169] Removed the pip force-reinstall --- scripts/build_lbann.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 30f3af8f18a..63867aada05 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1044,9 +1044,9 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then for p in ${PIP_EXTRAS} do if [[ -e "${p}" ]]; then - CMD="python3 -m pip install --force-reinstall -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} -r ${p}" + CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} -r ${p}" else - CMD="python3 -m pip install --force-reinstall -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} ${p}" + CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${LBANN_INSTALL_DIR} ${p}" fi echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } From 997d4ed916ba1057fad5c93812131591be8792eb Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 21:07:03 -0700 Subject: [PATCH 063/169] Fixed pascal build path. 
--- scripts/superbuild/ci/ci_pascal_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index 1dd7c48d67a..aab7f1945dc 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -42,7 +42,7 @@ INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/openmpi-4.1.2 # Use an accessible build directory so that the source files are preserved for debuggin -BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/tioga/${PE_ENV_lc}/${ROCM_VER} +BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/pascal/cuda-11.8.0 # Location of external packages export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-2.0.1.2/libcutensor-linux-x86_64-2.0.1.2-archive From a7c6c6f0c0a51ae393f914cbb68e4279b02bfd52 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 22:15:10 -0700 Subject: [PATCH 064/169] Fixed the quotes around the linker flags. 
--- scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 4 ++-- scripts/superbuild/ci/ci_pascal_dha.sh | 4 ++-- scripts/superbuild/ci/ci_pascal_dha_distconv.sh | 4 ++-- scripts/superbuild/ci/ci_pascal_dha_half.sh | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index 9c4fce61b4f..469f93d9259 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -63,8 +63,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_CUDA_STANDARD=17 \ diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh index b14fb10913d..9ae2b9d47fc 100755 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -74,8 +74,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_CUDA_STANDARD=17 \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index e9dc7dee8c5..470b86b2b72 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -73,8 +73,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ 
+ -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_CUDA_STANDARD=17 \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh index be5dc6013ec..a49b27823cb 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -72,8 +72,8 @@ cmake \ -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ - -D CMAKE_EXE_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ - -D CMAKE_SHARED_LINKER_FLAGS=${EXTRA_LINK_FLAGS} \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_CUDA_STANDARD=17 \ From 352a8d34ed0af86e3300266ce88658cf5d3e28a5 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 22:37:47 -0700 Subject: [PATCH 065/169] Do not use gold linker for core dependencies because protobuf fails. --- scripts/superbuild/ci/ci_pascal_env.sh | 3 ++- scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index aab7f1945dc..be049453b32 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -32,7 +32,8 @@ EXTRA_CXX_FLAGS="-O2 -g3 -Wno-deprecated-declarations" EXTRA_CUDA_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS="-fuse-ld=gold -Wl,--disable-new-dtags" +EXTRA_LINK_FLAGS_CORE="-Wl,--disable-new-dtags" +EXTRA_LINK_FLAGS="-fuse-ld=gold ${EXTRA_LINK_FLAGS_CORE}" # Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) CUDA_GPU_ARCH=60 diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 2aab738031b..27ffacab513 100755 --- 
a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -76,8 +76,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_HIP_STANDARD=17 \ From 1860023337891cbd792ff6188d707a9747bdd99b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 22:42:07 -0700 Subject: [PATCH 066/169] Fixed typo --- scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 4 ++-- scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index 469f93d9259..e4d8a92bfe6 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -63,8 +63,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_CUDA_STANDARD=17 \ diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 27ffacab513..2aab738031b 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -76,8 +76,8 @@ cmake \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ + -D 
CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ -D CMAKE_CXX_STANDARD=17 \ -D CMAKE_HIP_STANDARD=17 \ From 1b08f50a5c68e9344da8abe60a583e92782f57b2 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 25 Jun 2024 23:03:08 -0700 Subject: [PATCH 067/169] Updated the version of half to 2.2.0 --- scripts/superbuild/ci/ci_pascal_dha_half.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh index a49b27823cb..da74d34c910 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -52,7 +52,7 @@ BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-half # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.2.0:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} From 440dc06be028daef73930e870ec71b951985f0f1 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 01:03:43 -0700 Subject: [PATCH 068/169] Did not set the loaded modules in the LBANN module file. 
--- scripts/build_lbann.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 63867aada05..a55005bdf81 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1110,7 +1110,7 @@ EOF # Build a list of modules that LBANN should load LBANN_WRITE_DEPENDENT_MODULEPATH="${LBANN_MODFILES_DIR}/Core" LBANN_DEPENDENT_MODULES=$(spack-python $SCRIPTS_DIR/find_externals_and_lbann_top_level_dependencies.py) - LBANN_DEPENDENT_MODULES="${BUILD_MODULES};${LBANN_DEPENDENT_MODULES}" +# LBANN_DEPENDENT_MODULES="${BUILD_MODULES};${LBANN_DEPENDENT_MODULES}" if [[ "${CENTER_COMPILER}" =~ .*"%clang".* ]]; then # If the compiler is clang use the LLD fast linker CENTER_LINKER_FLAGS="+lld" From 3e18d2545ac126106a761e17a29df7b5b20e9649 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 09:57:19 -0700 Subject: [PATCH 069/169] Include ROCM_PATH/lib to RPATH. Switch Pascal back to gcc/10.3.1. --- scripts/customize_build_env.sh | 5 +++-- scripts/superbuild/ci/ci_tioga_env.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index b8bbcccb228..2a89adefcd5 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -114,8 +114,8 @@ set_center_specific_modules() ;; "broadwell" | "haswell" | "sandybridge") # Pascal, RZHasGPU, Surface - MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/11.2.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" -# MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/10.3.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" +# MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/11.2.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" + MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/10.3.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" # Note that clang is installed in /usr/workspace/brain/tom/pascal/llvm/latest/ and 
it is version 17.0.0 MODULE_CMD_CLANG="module load gcc/10.3.1 cuda/11.8.0 mvapich2/2.3.7 python/3.9.12" ;; @@ -486,6 +486,7 @@ cat <> ${yaml} # - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') + echo "BVE Using the Cray programming environment ${PE_ENV_lc}" set_superbuild_externals ${host} "${PE_ENV_lc}/rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" set_superbuild_DHA_externals ${host} "${PE_ENV_lc}/rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 46e9835cdb9..5a9ed477d82 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -31,7 +31,7 @@ EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" -EXTRA_RPATHS=${CRAYLIBS_X86_64} +EXTRA_RPATHS="${CRAYLIBS_X86_64};${ROCM_PATH}/lib" # Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) AMD_GPU_ARCH=gfx90a,gfx942 From 8ece61d130b8d6d70cb4bee0a74f7f97f5419884 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 13:21:16 -0700 Subject: [PATCH 070/169] Switch Pascal CI to using Clang 14. Added compiler into the CI superbuild external paths. 
--- scripts/customize_build_env.sh | 15 ++-- .../ci/ci_pascal_core_dependencies.sh | 6 +- scripts/superbuild/ci/ci_pascal_dha.sh | 8 +- .../superbuild/ci/ci_pascal_dha_distconv.sh | 6 +- scripts/superbuild/ci/ci_pascal_dha_half.sh | 8 +- scripts/superbuild/ci/ci_pascal_env.sh | 11 ++- .../ci/ci_tioga_core_dependencies.sh | 2 +- scripts/superbuild/ci/ci_tioga_dha.sh | 2 +- .../superbuild/ci/ci_tioga_dha_distconv.sh | 2 +- scripts/superbuild/ci/ci_tioga_env.sh | 5 +- scripts/superbuild_externals.sh | 87 ++++++++++--------- 11 files changed, 81 insertions(+), 71 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 2a89adefcd5..63faf5bb8b0 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -117,7 +117,8 @@ set_center_specific_modules() # MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/11.2.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" MODULE_CMD_GCC="module load jobutils/1.0 StdEnv gcc/10.3.1-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" # Note that clang is installed in /usr/workspace/brain/tom/pascal/llvm/latest/ and it is version 17.0.0 - MODULE_CMD_CLANG="module load gcc/10.3.1 cuda/11.8.0 mvapich2/2.3.7 python/3.9.12" +# MODULE_CMD_CLANG="module load gcc/10.3.1 cuda/11.8.0 mvapich2/2.3.7 python/3.9.12" + MODULE_CMD_CLANG="module load jobutils/1.0 StdEnv clang/14.0.6-magic ninja/1.11.1 openmpi/4.1.2 cuda/11.8.0 python/3.9.12" ;; "ivybridge" | "cascadelake") # Catalyst, Ruby MODULE_CMD="module load gcc/10.2.1 mvapich2/2.3.6 python/3.7.2" @@ -204,8 +205,8 @@ set_center_specific_spack_dependencies() "broadwell" | "haswell" | "sandybridge") # Pascal, RZHasGPU, Surface # On LC the mvapich2 being used is built against HWLOC v1 CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-11.2.1-magic /usr/workspace/brain/tom/pascal/llvm/latest/" - CENTER_COMPILER="%gcc" -# CENTER_COMPILER="%clang" +# CENTER_COMPILER="%gcc" + CENTER_COMPILER="%clang" # 
DEPENDENTS_CENTER_COMPILER="%gcc@10.3.1" # There is something weird about the python@3.9.13 on Pascal right now 5/31/2023 CENTER_DEPENDENCIES="^openmpi@4.1.2" @@ -333,8 +334,8 @@ cat <> ${yaml} modules: - mvapich2/2.3.7 EOF - set_superbuild_externals ${host} "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" "${dha_dir}" ;; "power9le" | "power8le") cat <> ${yaml} @@ -487,8 +488,8 @@ cat <> ${yaml} EOF PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') echo "BVE Using the Cray programming environment ${PE_ENV_lc}" - set_superbuild_externals ${host} "${PE_ENV_lc}/rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "${PE_ENV_lc}/rocm-5.7.1" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}""cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" "${dha_dir}" # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index e4d8a92bfe6..e8791a34765 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -55,10 +55,10 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ - -D CMAKE_C_COMPILER=$(which gcc) \ - -D 
CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_C_COMPILER=$(which clang) \ + -D CMAKE_CXX_COMPILER=$(which clang++) \ -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ -D BUILD_SHARED_LIBS=ON \ diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh index 9ae2b9d47fc..b4df0224a97 100755 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -52,7 +52,7 @@ BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} @@ -66,10 +66,10 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ - -D CMAKE_C_COMPILER=$(which gcc) \ - -D CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_C_COMPILER=$(which clang) \ + -D CMAKE_CXX_COMPILER=$(which clang++) \ -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ -D BUILD_SHARED_LIBS=ON \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index 470b86b2b72..7d786d59d5c 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -65,10 +65,10 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ - -D CMAKE_C_COMPILER=$(which gcc) \ - -D CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_C_COMPILER=$(which clang) \ + -D 
CMAKE_CXX_COMPILER=$(which clang++) \ -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ -D BUILD_SHARED_LIBS=ON \ diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh index da74d34c910..112c664f5db 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -52,7 +52,7 @@ BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-half # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.2.0:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.2.0:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} @@ -66,10 +66,10 @@ cmake \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ - -D CMAKE_C_COMPILER=$(which gcc) \ - -D CMAKE_CXX_COMPILER=$(which g++) \ + -D CMAKE_C_COMPILER=$(which clang) \ + -D CMAKE_CXX_COMPILER=$(which clang++) \ -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v g++) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ -D CMAKE_Fortran_COMPILER=$(which gfortran) \ \ -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index be049453b32..df13038ef3d 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -33,17 +33,20 @@ EXTRA_CUDA_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS_CORE="-Wl,--disable-new-dtags" -EXTRA_LINK_FLAGS="-fuse-ld=gold ${EXTRA_LINK_FLAGS_CORE}" +EXTRA_LINK_FLAGS="-fuse-ld=lld ${EXTRA_LINK_FLAGS_CORE}" # Set 
this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) CUDA_GPU_ARCH=60 +CUDA_VER=cuda-11.8.0 +COMPILER_VER=clang-14.0.6-magic # Set to the preferred install directory -INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/pascal/cuda-11.8.0 -INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/openmpi-4.1.2 +CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies +INSTALL_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/pascal/${CUDA_VER} +INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/openmpi-4.1.2 # Use an accessible build directory so that the source files are preserved for debuggin -BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/pascal/cuda-11.8.0 +BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/pascal/${CUDA_VER}/${COMPILER_VER} # Location of external packages export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-2.0.1.2/libcutensor-linux-x86_64-2.0.1.2-archive diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index 2aab738031b..f77b390391f 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -56,7 +56,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} # Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies-${PE_ENV_lc}-${ROCM_VER} +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies cmake \ -G Ninja \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index 7d93c052ab6..945bc3d6d60 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -60,7 +60,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha # Set to the preferred build directory 
-BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-${PE_ENV_lc}-${ROCM_VER} +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index 842ae95a23a..cd676eb06f6 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -60,7 +60,7 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv # Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv-${PE_ENV_lc}-${ROCM_VER} +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 5a9ed477d82..3a66f1cd95b 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -36,16 +36,15 @@ EXTRA_RPATHS="${CRAYLIBS_X86_64};${ROCM_PATH}/lib" # Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) AMD_GPU_ARCH=gfx90a,gfx942 - # Set to the preferred install directory #INSTALL_PREFIX=${PWD}/install-rocm-distconv ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') -INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${PE_ENV_lc}/${ROCM_VER} +INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${ROCM_VER}/${PE_ENV_lc} INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-8.1.29/ # Use an accessible build directory so that the source files are preserved for debuggin -BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/tioga/${PE_ENV_lc}/${ROCM_VER} 
+BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/tioga/${ROCM_VER}/${PE_ENV_lc} # Location of external packages #export CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 3b140240c5b..6b62397faaf 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -2,11 +2,12 @@ set_superbuild_externals() { local system="$1" local dnn_lib="$2" - local mpi="$3" - local yaml="$4" - local LOG="$5" - local prefix="$6" - local gpu_arch="$7" + local compiler_ver="$3" + local mpi="$4" + local yaml="54" + local LOG="$6" + local prefix="$7" + local gpu_arch="$8" if [ -n "${gpu_arch}" ]; then dnn_lib="${dnn_lib}/${gpu_arch}" @@ -15,8 +16,9 @@ set_superbuild_externals() #/usr/workspace/lbann/stable_dependencies/rzvernal/rocm-5.7.1/mi300a/cray-mpich-8.1.27 #/usr/workspace/lbann/lbann-superbuild/ - - CMD="source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + + local sb_extra_prefix="${system}/${dnn_lib}/${compiler_ver}/${mpi}" + CMD="source ${prefix}/${sb_extra_prefix}/logs/lbann_sb_suggested_cmake_prefix_path.sh" # CMD="source /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh" echo ${CMD} | tee -a ${LOG} ${CMD} @@ -28,84 +30,84 @@ cat <> ${yaml} - 'master' externals: - spec: adiak@master arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/adiak + prefix: ${prefix}/${sb_extra_prefix}/adiak caliper: buildable: false version: - 'master' externals: - spec: caliper@master arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/caliper + prefix: ${prefix}/${sb_extra_prefix}/caliper catch2: buildable: false version: - '2.9.2' externals: - spec: catch2@2.9.2 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/catch2 + prefix: ${prefix}/${sb_extra_prefix}/catch2 hdf5: buildable: false version: - '1.10.9' 
externals: - spec: hdf5@1.10.9 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hdf5 + prefix: ${prefix}/${sb_extra_prefix}/hdf5 jpeg-turbo: buildable: false version: - '2.0.3' externals: - spec: jpeg-turbo@2.0.3 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/jpeg-turbo + prefix: ${prefix}/${sb_extra_prefix}/jpeg-turbo spdlog: buildable: false version: - '1.12.0' externals: - spec: spdlog@1.12.0 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/spdlog + prefix: ${prefix}/${sb_extra_prefix}/spdlog cereal: buildable: false version: - '1.3.0' externals: - spec: cereal@1.3.0 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/cereal + prefix: ${prefix}/${sb_extra_prefix}/cereal clara: buildable: false version: - '1.1.5' externals: - spec: clara@1.1.5 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/clara + prefix: ${prefix}/${sb_extra_prefix}/clara cnpy: buildable: false version: - 'master' externals: - spec: cnpy@master arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/cnpy + prefix: ${prefix}/${sb_extra_prefix}/cnpy conduit: buildable: false version: - 'develop' externals: - spec: conduit@develop arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/conduit + prefix: ${prefix}/${sb_extra_prefix}/conduit hiptt: buildable: false version: - 'master' externals: - spec: hiptt@master arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hiptt + prefix: ${prefix}/${sb_extra_prefix}/hiptt opencv: buildable: false version: - '4.1.0' externals: - spec: opencv@4.1.0 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/opencv + prefix: ${prefix}/${sb_extra_prefix}/opencv protobuf: buildable: false version: @@ -113,14 +115,14 @@ cat <> ${yaml} externals: - spec: protobuf@3.21.5+shared arch=${spack_arch} # - spec: protobuf@3.21.5~shared arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/protobuf + prefix: 
${prefix}/${sb_extra_prefix}/protobuf zstr: buildable: false version: - 'master' externals: - spec: zstr@master arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/zstr + prefix: ${prefix}/${sb_extra_prefix}/zstr EOF if [[ ${dnn_lib} =~ "rocm" ]]; then @@ -131,14 +133,14 @@ cat <> ${yaml} - '3.0.0' externals: - spec: hwloc@3.0.0 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/hwloc + prefix: ${prefix}/${sb_extra_prefix}/hwloc aws-ofi-rccl: buildable: false version: - 'cxi' externals: - spec: aws-ofi-rccl@cxi arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/aws_ofi_rccl + prefix: ${prefix}/${sb_extra_prefix}/aws_ofi_rccl EOF fi @@ -150,7 +152,7 @@ cat <> ${yaml} - '2.19.4' externals: - spec: nccl@2.19.4 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/nccl + prefix: ${prefix}/${sb_extra_prefix}/nccl cudnn: buildable: false version: @@ -177,19 +179,22 @@ EOF set_superbuild_DHA_externals() { local system="$1" - local dnn_lib="$2" - local mpi="$3" - local yaml="$4" - local prefix="$5" - local dha_dir="$6" - local gpu_arch="$7" + local compiler_ver="$3" + local dnn_lib="$3" + local mpi="$4" + local yaml="$5" + local LOG="$6" + local prefix="$7" + local dha_dir="$8" + local gpu_arch="$9" if [ -n "${gpu_arch}" ]; then dnn_lib="${dnn_lib}/${gpu_arch}" fi + local sb_extra_prefix="${system}/${dnn_lib}/${compiler_ver}/${mpi}" # source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh - CMD="source ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + CMD="source ${prefix}/${sb_extra_prefix}/${dha_dir}/logs/lbann_sb_suggested_cmake_prefix_path.sh" echo ${CMD} | tee -a ${LOG} ${CMD} @@ -200,21 +205,21 @@ cat <> ${yaml} - 'master' externals: - spec: aluminum@master arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/aluminum + prefix: ${prefix}/${sb_extra_prefix}/${dha_dir}/aluminum hydrogen: buildable: false 
version: - 'develop' externals: - spec: hydrogen@develop arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/hydrogen + prefix: ${prefix}/${sb_extra_prefix}/${dha_dir}/hydrogen dihydrogen: buildable: false version: - 'develop' externals: - spec: dihydrogen@develop arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/${dha_dir}/dihydrogen + prefix: ${prefix}/${sb_extra_prefix}/${dha_dir}/dihydrogen EOF } @@ -222,16 +227,18 @@ set_superbuild_power_externals() { local system="$1" local dnn_lib="$2" - local mpi="$3" - local yaml="$4" - local prefix="$5" - local gpu_arch="$6" + local compiler_ver="$3" + local mpi="$4" + local yaml="$5" + local prefix="$6" + local gpu_arch="$7" if [ -n "${gpu_arch}" ]; then dnn_lib="${dnn_lib}/${gpu_arch}" fi -# source ${prefix}/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh + local sb_extra_prefix="${system}/${dnn_lib}/${compiler_ver}/${mpi}" +# source ${prefix}/${sb_extra_prefix}/logs/lbann_sb_suggested_cmake_prefix_path.sh cat <> ${yaml} openblas: @@ -240,6 +247,6 @@ cat <> ${yaml} - '0.3.6' externals: - spec: openblas@0.3.6 arch=${spack_arch} - prefix: ${prefix}/${system}/${dnn_lib}/${mpi}/openblas + prefix: ${prefix}/${sb_extra_prefix}/openblas EOF } From 1d6d4ff6cd4e3169664a9fa5f0a52a94f6f47610 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 13:53:38 -0700 Subject: [PATCH 071/169] Fixed compiler paths and typos. 
--- scripts/customize_build_env.sh | 3 ++- scripts/superbuild_externals.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 63faf5bb8b0..0fd53e5e944 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -204,7 +204,8 @@ set_center_specific_spack_dependencies() ;; "broadwell" | "haswell" | "sandybridge") # Pascal, RZHasGPU, Surface # On LC the mvapich2 being used is built against HWLOC v1 - CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-11.2.1-magic /usr/workspace/brain/tom/pascal/llvm/latest/" +# CENTER_COMPILER_PATHS="/usr/tce/packages/gcc/gcc-11.2.1-magic /usr/workspace/brain/tom/pascal/llvm/latest/" + CENTER_COMPILER_PATHS="/usr/tce/packages/clang/clang-14.0.6-magic /usr/tce/packages/gcc/gcc-10.3.1-magic /usr/workspace/brain/tom/pascal/llvm/latest/" # CENTER_COMPILER="%gcc" CENTER_COMPILER="%clang" # DEPENDENTS_CENTER_COMPILER="%gcc@10.3.1" diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 6b62397faaf..00143444fa9 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -179,8 +179,8 @@ EOF set_superbuild_DHA_externals() { local system="$1" + local dnn_lib="$2" local compiler_ver="$3" - local dnn_lib="$3" local mpi="$4" local yaml="$5" local LOG="$6" From 0d8c7cabaa5894642246e64b3e6097d269e742db Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 14:15:37 -0700 Subject: [PATCH 072/169] Fixed typo. 
--- scripts/superbuild_externals.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 00143444fa9..80c3f8d40bf 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -4,7 +4,7 @@ set_superbuild_externals() local dnn_lib="$2" local compiler_ver="$3" local mpi="$4" - local yaml="54" + local yaml="$5" local LOG="$6" local prefix="$7" local gpu_arch="$8" From 086617852ddf53d67d850d2d41b9e700f81e6166 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 14:56:17 -0700 Subject: [PATCH 073/169] Commented out unused variable. --- src/callbacks/check_gradients.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index 1e9e0f2cb84..82923533948 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -246,7 +246,7 @@ void check_gradients::do_check_gradients(model& m) const auto& c = static_cast(m.get_execution_context()); auto& comm = *m.get_comm(); const auto mode = c.get_execution_mode(); - const auto& layers = m.get_layers(); + // const auto& layers = m.get_layers(); // Return immediately if gradient check isn't currently needed if (!m_modes.empty() && m_modes.count(mode) == 0) { From d5176b188a8e9e477a4414e3366a631444d8351c Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 16:46:42 -0700 Subject: [PATCH 074/169] Log file for superbuild shell script is now defined in the environment rather than passed as an argument. 
--- scripts/build_lbann.sh | 14 +++++++------- scripts/customize_build_env.sh | 12 ++++++------ scripts/superbuild_externals.sh | 16 +++++++--------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index a55005bdf81..18d8a1ec215 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -368,6 +368,13 @@ fi # Detect system parameters CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') +LOG="spack-build-${LBANN_ENV}.log" +if [[ -f ${LOG} ]]; then + CMD="rm ${LOG}" + echo ${CMD} + [[ -z "${DRY_RUN:-}" ]] && ${CMD} +fi + # Identify the center that we are running at CENTER= # Customize the build based on the center @@ -396,13 +403,6 @@ else AT_LBANN_LABEL="" fi -LOG="spack-build-${LBANN_ENV}.log" -if [[ -f ${LOG} ]]; then - CMD="rm ${LOG}" - echo ${CMD} - [[ -z "${DRY_RUN:-}" ]] && ${CMD} -fi - LBANN_BUILD_LABEL="lbann_${CLUSTER}_${LBANN_LABEL}" LBANN_BUILD_PARENT_DIR="${LBANN_HOME}/builds/${LBANN_BUILD_LABEL}" LBANN_BUILD_DIR="${LBANN_BUILD_PARENT_DIR}/build" diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 0fd53e5e944..1601cabd233 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -335,8 +335,8 @@ cat <> ${yaml} modules: - mvapich2/2.3.7 EOF - set_superbuild_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "power9le" | "power8le") cat <> ${yaml} @@ -349,7 +349,7 @@ cat <> ${yaml} - spec: rdma-core@20 arch=${spack_arch} prefix: /usr EOF - # set_superbuild_externals ${host} "cuda-11.8.0" 
"spectrum-mpi-rolling-release" "$yaml" "${LOG}" "${prefix}" + # set_superbuild_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" # set_superbuild_DHA_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" "${dha_dir}" # set_superbuild_power_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" @@ -389,7 +389,7 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - # set_superbuild_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" "${prefix}" + # set_superbuild_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" # set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; @@ -489,8 +489,8 @@ cat <> ${yaml} EOF PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') echo "BVE Using the Cray programming environment ${PE_ENV_lc}" - set_superbuild_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}""cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}" "cray-mpich-8.1.29" "$yaml" "${LOG}" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}" "cray-mpich-8.1.29" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 80c3f8d40bf..02dc661f0d5 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -5,16 +5,15 @@ set_superbuild_externals() local compiler_ver="$3" local mpi="$4" local yaml="$5" - local LOG="$6" - local prefix="$7" - local gpu_arch="$8" - + local prefix="$6" + local gpu_arch="$7" + if [ -n "${gpu_arch}" ]; then 
dnn_lib="${dnn_lib}/${gpu_arch}" fi #/usr/workspace/lbann/stable_dependencies/rzvernal/rocm-5.7.1/mi300a/cray-mpich-8.1.27 - + #/usr/workspace/lbann/lbann-superbuild/ local sb_extra_prefix="${system}/${dnn_lib}/${compiler_ver}/${mpi}" @@ -183,10 +182,9 @@ set_superbuild_DHA_externals() local compiler_ver="$3" local mpi="$4" local yaml="$5" - local LOG="$6" - local prefix="$7" - local dha_dir="$8" - local gpu_arch="$9" + local prefix="$6" + local dha_dir="$7" + local gpu_arch="$8" if [ -n "${gpu_arch}" ]; then dnn_lib="${dnn_lib}/${gpu_arch}" From dec483aa73474dad783ec18e2f5c3476ffaadac8 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 17:46:12 -0700 Subject: [PATCH 075/169] Fixed the extra RPATH on cray. --- scripts/superbuild/ci/ci_tioga_env.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 3a66f1cd95b..45ef392e40f 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -31,7 +31,8 @@ EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" -EXTRA_RPATHS="${CRAYLIBS_X86_64};${ROCM_PATH}/lib" +# If using PrgEnv-cray add ${CRAYLIBS_X86_64} +EXTRA_RPATHS="${ROCM_PATH}/lib" # Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) AMD_GPU_ARCH=gfx90a,gfx942 @@ -55,4 +56,3 @@ CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} #CPLUS_INCLUDE_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen/include:${CPLUS_INCLUDE_PATH} export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - From 2f60f1bcf799e62f20d61019d28067f0ba3c96f9 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 18:07:00 -0700 Subject: [PATCH 076/169] Switched back to half v2.1.0. Added logging for the modules used to build the superbuild. 
--- scripts/superbuild/ci/ci_pascal_core_dependencies.sh | 3 +++ scripts/superbuild/ci/ci_pascal_dha.sh | 3 +++ scripts/superbuild/ci/ci_pascal_dha_distconv.sh | 3 +++ scripts/superbuild/ci/ci_pascal_dha_half.sh | 5 ++++- scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 3 +++ scripts/superbuild/ci/ci_tioga_dha.sh | 3 +++ scripts/superbuild/ci/ci_tioga_dha_distconv.sh | 3 +++ scripts/superbuild_externals.sh | 7 +++++++ 8 files changed, 29 insertions(+), 1 deletion(-) diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh index e8791a34765..2239fc5f60c 100755 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh @@ -94,3 +94,6 @@ cmake \ -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_OpenCV_TAG=4.x + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh index b4df0224a97..ebd1cc9927b 100755 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ b/scripts/superbuild/ci/ci_pascal_dha.sh @@ -110,3 +110,6 @@ cmake \ -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh index 7d786d59d5c..f16e65dc45c 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh @@ -109,3 +109,6 @@ cmake \ -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ -D 
LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh index 112c664f5db..0243b15dabc 100755 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ b/scripts/superbuild/ci/ci_pascal_dha_half.sh @@ -52,7 +52,7 @@ BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-half # Update the location of external packages source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.2.0:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} @@ -108,3 +108,6 @@ cmake \ -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index f77b390391f..bf8f7c9a769 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -110,3 +110,6 @@ cmake \ -D LBANN_SB_OpenCV_TAG=4.x \ \ -D LBANN_SB_BUILD_AWS_OFI_RCCL=${BUILD_AWS_OFI_RCCL_PLUGIN}} + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index 945bc3d6d60..758b479a68f 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -124,3 +124,6 @@ cmake \ -D 
LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index cd676eb06f6..95e7bb9630f 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -124,3 +124,6 @@ cmake \ -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 02dc661f0d5..6c0204a9aec 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -44,6 +44,13 @@ cat <> ${yaml} externals: - spec: catch2@2.9.2 arch=${spack_arch} prefix: ${prefix}/${sb_extra_prefix}/catch2 + half: + buildable: false + version: + - '2.1.0' + externals: + - spec: half@2.1.0 arch=${spack_arch} + prefix: ${prefix}/half-2.1.0 hdf5: buildable: false version: From 4272c863b8d8d934004a20593b0f9decc874fb24 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 26 Jun 2024 23:30:00 -0700 Subject: [PATCH 077/169] Fixing the extra RPATHs field to handle multiple entries. 
--- scripts/superbuild/ci/ci_tioga_core_dependencies.sh | 4 ++-- scripts/superbuild/ci/ci_tioga_dha.sh | 4 ++-- scripts/superbuild/ci/ci_tioga_dha_distconv.sh | 4 ++-- scripts/superbuild/ci/ci_tioga_env.sh | 2 +- scripts/superbuild/ci/ci_tioga_lbann.sh | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index bf8f7c9a769..d3593772eae 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -67,8 +67,8 @@ cmake \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ - -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ + -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh index 758b479a68f..6b040488ce8 100755 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ b/scripts/superbuild/ci/ci_tioga_dha.sh @@ -79,8 +79,8 @@ cmake \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ - -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ + -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh index 95e7bb9630f..43d09e6b709 100755 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh @@ -79,8 +79,8 @@ cmake \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - 
-D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ - -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ + -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 45ef392e40f..54cbec0b8bd 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -32,7 +32,7 @@ EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" # If using PrgEnv-cray add ${CRAYLIBS_X86_64} -EXTRA_RPATHS="${ROCM_PATH}/lib" +EXTRA_RPATHS="${ROCM_PATH}/lib\;${ROCM_PATH}/llvm/lib" # Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) AMD_GPU_ARCH=gfx90a,gfx942 diff --git a/scripts/superbuild/ci/ci_tioga_lbann.sh b/scripts/superbuild/ci/ci_tioga_lbann.sh index 6b8b9e1bfcc..d1fdf82370a 100755 --- a/scripts/superbuild/ci/ci_tioga_lbann.sh +++ b/scripts/superbuild/ci/ci_tioga_lbann.sh @@ -114,8 +114,8 @@ cmake \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH=${EXTRA_RPATHS} \ - -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ + -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ + -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ \ -D CMAKE_C_COMPILER=$(which amdclang) \ -D CMAKE_CXX_COMPILER=$(which amdclang++) \ From d39d5fee320982946eed3a74bdf253fe40f79311 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 27 Jun 2024 13:48:00 -0700 Subject: [PATCH 078/169] Add an updated time limit for the reconstruction loss unit test. 
--- ci_test/unit_tests/test_unit_reconstruction_loss.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ci_test/unit_tests/test_unit_reconstruction_loss.py b/ci_test/unit_tests/test_unit_reconstruction_loss.py index 3b6644f8cd4..0fd2921f801 100644 --- a/ci_test/unit_tests/test_unit_reconstruction_loss.py +++ b/ci_test/unit_tests/test_unit_reconstruction_loss.py @@ -18,6 +18,7 @@ def skeleton_jag_reconstruction_loss(cluster, dir_name, cluster=cluster, num_nodes=2, num_processes=32, + time_limit=3, disable_cuda=1, dir_name=dir_name, sample_list_train_default='/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K4trainers/100Kindex.txt', From bfea5e680cd6a84b8fe67d0a3b66adde6518237a Mon Sep 17 00:00:00 2001 From: Tom Benson Date: Thu, 27 Jun 2024 16:48:57 -0400 Subject: [PATCH 079/169] Add EnsureComm calls to truncation selection algo --- .../ltfb/truncation_selection_exchange.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp b/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp index 6d18a431f71..a80dba25d00 100644 --- a/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp +++ b/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp @@ -150,6 +150,14 @@ void TruncationSelectionExchange::select_next(model& m, data_coordinator& dc) const { auto const& comm = *(m.get_comm()); + + El::mpi::EnsureComm( + comm.get_world_comm(), + El::SyncInfo{}); + El::mpi::EnsureComm( + comm.get_world_comm(), + El::SyncInfo{}); + const unsigned int num_trainers = comm.get_num_trainers(); const unsigned int trainer_id = comm.get_trainer_rank(); auto const step = ctxt.get_step(); From 4a8ecafe84b0695b9b4dbb8b40858b6d94c836d0 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 27 Jun 2024 13:55:29 -0700 Subject: [PATCH 080/169] Use a vertical | to avoid issues propagating ;. 
--- scripts/superbuild/ci/ci_tioga_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 54cbec0b8bd..04f960f4656 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -32,7 +32,7 @@ EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" # If using PrgEnv-cray add ${CRAYLIBS_X86_64} -EXTRA_RPATHS="${ROCM_PATH}/lib\;${ROCM_PATH}/llvm/lib" +EXTRA_RPATHS="${ROCM_PATH}/lib|${ROCM_PATH}/llvm/lib" # Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) AMD_GPU_ARCH=gfx90a,gfx942 From d133abee25f045afa8b5beeb67680b092c87fb2f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 27 Jun 2024 16:25:26 -0700 Subject: [PATCH 081/169] Constrain version of NumPy to 1.22.3 --- ci_test/requirements.txt | 1 + scripts/common_python_packages/requirements.txt | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci_test/requirements.txt b/ci_test/requirements.txt index 6ada807c10f..50787b5adb5 100644 --- a/ci_test/requirements.txt +++ b/ci_test/requirements.txt @@ -1,3 +1,4 @@ pytest +numpy==1.22.3 scipy tqdm diff --git a/scripts/common_python_packages/requirements.txt b/scripts/common_python_packages/requirements.txt index 73932bcc38c..7e0b09d32ad 100644 --- a/scripts/common_python_packages/requirements.txt +++ b/scripts/common_python_packages/requirements.txt @@ -1,3 +1,2 @@ -numpy +numpy==1.22.3 pytest - From 4fe80e092fbba0413992fded603375526677f4ba Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 28 Jun 2024 11:37:38 -0700 Subject: [PATCH 082/169] Removed the -O2 optimization flags from the pascal and tioga environments because it will be set by the CMake build type. Added a superbuild package for hipTT.
--- scripts/superbuild/CMakeLists.txt | 1 + scripts/superbuild/ci/ci_pascal_env.sh | 6 +- scripts/superbuild/ci/ci_tioga_env.sh | 4 +- scripts/superbuild/hiptt/CMakeLists.txt | 78 +++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 scripts/superbuild/hiptt/CMakeLists.txt diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index 9507e5a629a..7918b3f7c49 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -88,6 +88,7 @@ lbann_sb_add_packages( Clara CNPY Conduit + hiptt Hydrogen # DiHydrogen depends on H. DiHydrogen OpenCV diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index df13038ef3d..b8a8003d88b 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -26,10 +26,8 @@ # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-O2 -g3 -Wno-deprecated-declarations" -#EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -#EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_CUDA_FLAGS="-O2 -g3 -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS_CORE="-Wl,--disable-new-dtags" diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 04f960f4656..9fa5b3048e5 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -26,8 +26,8 @@ # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. 
-EXTRA_CXX_FLAGS="-O2 -g3 -Wno-deprecated-declarations" -EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" diff --git a/scripts/superbuild/hiptt/CMakeLists.txt b/scripts/superbuild/hiptt/CMakeLists.txt new file mode 100644 index 00000000000..3309f8d2398 --- /dev/null +++ b/scripts/superbuild/hiptt/CMakeLists.txt @@ -0,0 +1,78 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ +lbann_sb_init_extern_pkg( + NAME hiptt + LANGUAGES C CXX HIP + GITHUB_URL tbennun/hipTT.git + GIT_TAG "master") + +# The build system here is just a set of makefiles. 
+find_program(GNU_MAKE_PROGRAM make) + +include (ExternalProject) +ExternalProject_Add(${PKG_NAME} + PREFIX "${CMAKE_CURRENT_BINARY_DIR}" + ${LBANN_SB_GIT_REPOSITORY_TAG} ${LBANN_SB_${PKG_NAME}_URL} + ${LBANN_SB_GIT_TAG_TAG} ${LBANN_SB_${PKG_NAME}_TAG} + TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp" + STAMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/stamp" + + SOURCE_DIR "${LBANN_SB_${PKG_NAME}_SOURCE_DIR}" + INSTALL_DIR "${LBANN_SB_${PKG_NAME}_PREFIX}" + + GIT_SHALLOW 1 + + BUILD_IN_SOURCE 1 + USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 + + CONFIGURE_COMMAND "" + + BUILD_COMMAND + ${GNU_MAKE_PROGRAM} + "PREFIX=${LBANN_SB_${PKG_NAME}_PREFIX}" + "CC=${LBANN_SB_${PKG_NAME}_C_COMPILER}" + "CXX=${LBANN_SB_${PKG_NAME}_CXX_COMPILER}" + -j${${PKG_NAME}_MAX_MAKE_JOBS} + + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory + ${LBANN_SB_${PKG_NAME}_SOURCE_DIR}/include + ${LBANN_SB_${PKG_NAME}_PREFIX}/include + + COMMAND + ${CMAKE_COMMAND} -E copy_directory + ${LBANN_SB_${PKG_NAME}_SOURCE_DIR}/lib + ${LBANN_SB_${PKG_NAME}_PREFIX}/lib + +) + +set(${PKG_NAME}_DIR ${LBANN_SB_${PKG_NAME}_PREFIX} + CACHE INTERNAL "The install prefix of ${PKG_NAME}.") From 831adc571fb474c6fa496abc491dcbbc5e35af06 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 28 Jun 2024 12:36:06 -0700 Subject: [PATCH 083/169] Added superbuild scripts for Corona. Added hipTT to build_lbann.sh build script set. Updated Corona to 5.7.1. Re-enabled the Corona CI builds. 
--- .gitlab-ci.yml | 40 +++--- .gitlab/corona/pipeline.yml | 3 +- scripts/customize_build_env.sh | 31 ++--- .../ci/ci_corona_core_dependencies.sh | 106 ++++++++++++++++ scripts/superbuild/ci/ci_corona_dha.sh | 119 ++++++++++++++++++ .../superbuild/ci/ci_corona_dha_distconv.sh | 119 ++++++++++++++++++ scripts/superbuild/ci/ci_corona_env.sh | 53 ++++++++ .../ci/ci_tioga_core_dependencies.sh | 1 + scripts/superbuild_externals.sh | 7 ++ 9 files changed, 443 insertions(+), 36 deletions(-) create mode 100755 scripts/superbuild/ci/ci_corona_core_dependencies.sh create mode 100755 scripts/superbuild/ci/ci_corona_dha.sh create mode 100755 scripts/superbuild/ci/ci_corona_dha_distconv.sh create mode 100644 scripts/superbuild/ci/ci_corona_env.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e03e013ca29..0f6e76c95e4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -36,27 +36,27 @@ variables: stages: - run-all-clusters -# corona testing: -# stage: run-all-clusters -# variables: -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# trigger: -# strategy: depend -# include: .gitlab/corona/pipeline.yml +corona testing: + stage: run-all-clusters + variables: + WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" + WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" + trigger: + strategy: depend + include: .gitlab/corona/pipeline.yml -# corona distconv testing: -# stage: run-all-clusters -# variables: -# JOB_NAME_SUFFIX: _distconv -# SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" -# SPACK_SPECS: "+rocm +distconv" -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# TEST_FLAG: "test_*_distconv.py" -# trigger: -# strategy: depend -# include: .gitlab/corona/pipeline.yml +corona distconv testing: + stage: run-all-clusters + variables: + JOB_NAME_SUFFIX: _distconv + SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" + SPACK_SPECS: "+rocm +distconv" + WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" + WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" + 
TEST_FLAG: "test_*_distconv.py" + trigger: + strategy: depend + include: .gitlab/corona/pipeline.yml # lassen testing: # stage: run-all-clusters diff --git a/.gitlab/corona/pipeline.yml b/.gitlab/corona/pipeline.yml index 8458c06eea5..4820883cd62 100644 --- a/.gitlab/corona/pipeline.yml +++ b/.gitlab/corona/pipeline.yml @@ -73,8 +73,9 @@ build and install: - !reference [.setup_spack, script] - flux proxy ${JOB_ID} flux run -N 1 -t 30m ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=4 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux run -N 1 spack arch) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 1601cabd233..6fd5e3c9990 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -124,7 +124,8 @@ set_center_specific_modules() MODULE_CMD="module load gcc/10.2.1 mvapich2/2.3.6 python/3.7.2" ;; "zen" | "zen2") # Corona - MODULE_CMD="module load StdEnv gcc/10.3.1-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.0" + MODULE_CMD="module load StdEnv clang/14.0.6-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.1" +# MODULE_CMD="module load StdEnv gcc/10.3.1-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.0" # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; "zen3" | "zen4") # Tioga, RZVernal, RZAdams @@ -220,8 +221,8 @@ set_center_specific_spack_dependencies() ;; "zen" | "zen2") # Corona # On LC the mvapich2 being used is built against HWLOC v1 - CENTER_COMPILER="%rocmcc@5.7.0" - CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@5.7.0 ^python@3.9.12 ^py-protobuf@4.21.5" + CENTER_COMPILER="%rocmcc@5.7.1" + 
CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@5.7.1 ^python@3.9.12 ^py-protobuf@4.21.5" CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" ;; "zen3" | "zen4") # Tioga, RZVernal @@ -360,25 +361,25 @@ cat <> ${yaml} hipcub: buildable: false version: - - '5.7.0' + - '5.7.1' externals: - - spec: hipcub@5.7.0 arch=${spack_arch} - prefix: /opt/rocm-5.7.0/hipcub + - spec: hipcub@5.7.1 arch=${spack_arch} + prefix: /opt/rocm-5.7.1/hipcub extra_attributes: compilers: - c: /opt/rocm-5.7.0/llvm/bin/clang - c++: /opt/rocm-5.7.0/llvm/bin/clang++ + c: /opt/rocm-5.7.1/llvm/bin/clang + c++: /opt/rocm-5.7.1/llvm/bin/clang++ llvm-amdgpu: buildable: false version: - - '5.7.0' + - '5.7.1' externals: - - spec: llvm-amdgpu@5.7.0 arch=${spack_arch} - prefix: /opt/rocm-5.7.0/llvm + - spec: llvm-amdgpu@5.7.1 arch=${spack_arch} + prefix: /opt/rocm-5.7.1/llvm extra_attributes: compilers: - c: /opt/rocm-5.7.0/llvm/bin/clang - c++: /opt/rocm-5.7.0/llvm/bin/clang++ + c: /opt/rocm-5.7.1/llvm/bin/clang + c++: /opt/rocm-5.7.1/llvm/bin/clang++ openmpi: buildable: false version: @@ -389,8 +390,8 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - # set_superbuild_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" - # set_superbuild_DHA_externals ${host} "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "rocm-5.7.1" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-5.7.1" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "zen3" | "zen4") diff --git a/scripts/superbuild/ci/ci_corona_core_dependencies.sh b/scripts/superbuild/ci/ci_corona_core_dependencies.sh new file mode 100755 index 00000000000..1b3f6f96377 --- /dev/null +++ b/scripts/superbuild/ci/ci_corona_core_dependencies.sh @@ -0,0 +1,106 @@ +################################################################################ +## Copyright (c) 2014-2024, 
Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. 
+BUILD_WITH_DISTCONV=ON + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_corona_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} + +# Set to the preferred build directory +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ + -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ + \ + -D CMAKE_C_COMPILER=$(which amdclang) \ + -D CMAKE_CXX_COMPILER=$(which amdclang++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ + \ + -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ + \ + -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hiptt=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ + -D 
LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hwloc=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenCV_TAG=4.x + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_corona_dha.sh b/scripts/superbuild/ci/ci_corona_dha.sh new file mode 100755 index 00000000000..007a949a449 --- /dev/null +++ b/scripts/superbuild/ci/ci_corona_dha.sh @@ -0,0 +1,119 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. 
+################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN +BUILD_LBANN_STACK=ON + +# Set to ON to enable DistConv support. Only matters if building the +# LBANN stack. +BUILD_WITH_DISTCONV=OFF + +# Set to ON to enable Half support. Only matters if building the +# LBANN stack. +BUILD_WITH_HALF=OFF + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_corona_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha + +# Set to the preferred build directory +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha + +# Update the location of external packages +source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ + -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ + \ + -D CMAKE_C_COMPILER=$(which amdclang) \ + -D CMAKE_CXX_COMPILER=$(which amdclang++) \ + -D CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + \ + 
-D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ + \ + -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ + -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ + -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ + -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_corona_dha_distconv.sh b/scripts/superbuild/ci/ci_corona_dha_distconv.sh new file mode 100755 index 00000000000..c81dceec739 --- /dev/null +++ b/scripts/superbuild/ci/ci_corona_dha_distconv.sh @@ -0,0 +1,119 @@ 
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+# Set to ON (or any CMake truthy value) to build all of the
+# dependencies of the LBANN stack
+BUILD_EXTERNAL_TPLS=ON
+
+# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN
+BUILD_LBANN_STACK=ON
+
+# Set to ON to enable DistConv support. Only matters if building the
+# LBANN stack.
+BUILD_WITH_DISTCONV=ON
+
+# Set to ON to enable Half support. Only matters if building the
+# LBANN stack.
+BUILD_WITH_HALF=OFF
+
+# Set to the directory with the top-level CMakeLists.txt file for LBANN
+LBANN_SRC_DIR=$(git rev-parse --show-toplevel)
+
+# Set to the directory with the top-level SuperBuild CMakeLists.txt file
+SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild
+
+# Setup the common environment (quoted so a checkout path with spaces still works)
+source "${SUPERBUILD_SRC_DIR}/ci/ci_corona_env.sh"
+
+# Set to the preferred install directory
+INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv
+
+# Set to the preferred build directory
+BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv
+
+# Update the location of external packages
+source "${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh"
+#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH}
+CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;}
+FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|}
+
+cmake \
+  -G Ninja \
+  -S "${SUPERBUILD_SRC_DIR}" \
+  -B "${BUILD_DIR}" \
+  \
+  -D CMAKE_PREFIX_PATH="${CMAKE_CMAKE_PREFIX_PATH}" \
+  -D CMAKE_BUILD_TYPE=Release \
+  -D CMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \
+  -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \
+  -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \
+  -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \
+  \
+  -D CMAKE_C_COMPILER="$(command -v amdclang)" \
+  -D CMAKE_CXX_COMPILER="$(command -v amdclang++)" \
+  -D CMAKE_Fortran_COMPILER="$(command -v gfortran)" \
+  \
+  -D BUILD_SHARED_LIBS=ON \
+  -D CMAKE_POSITION_INDEPENDENT_CODE=ON \
+  -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \
+  -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \
+  \
+  -D CMAKE_CXX_STANDARD=17 \
+  -D CMAKE_HIP_STANDARD=17 \
+  -D CMAKE_HIP_ARCHITECTURES="${AMD_GPU_ARCH}" \
+  \
+  -D CMAKE_POSITION_INDEPENDENT_CODE=ON \
+  \
+  -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \
+  -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \
+  \
+  -D LBANN_SB_BUILD_Aluminum="${BUILD_LBANN_STACK}" \
+  -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+  -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
+  -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \
+  -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \
+  -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \
+  -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \
+  -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \
+  -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \
+  -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH="${FWD_CMAKE_PREFIX_PATH}" \
+  \
+  -D LBANN_SB_BUILD_Hydrogen="${BUILD_LBANN_STACK}" \
+  -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+  -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
+  -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF="${BUILD_WITH_HALF}" \
+  -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \
+  -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \
+  -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH="${FWD_CMAKE_PREFIX_PATH}" \
+  \
+  -D LBANN_SB_BUILD_DiHydrogen="${BUILD_LBANN_STACK}" \
+  -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+  -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
+  -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY="${BUILD_WITH_DISTCONV}" \
+  -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH="${FWD_CMAKE_PREFIX_PATH}"
+
+# Save a list of the currently loaded modules (create logs/ first in case it does not exist yet)
+mkdir -p "${INSTALL_PREFIX}/logs" && module -t list 2> "${INSTALL_PREFIX}/logs/modules.txt"
diff --git a/scripts/superbuild/ci/ci_corona_env.sh b/scripts/superbuild/ci/ci_corona_env.sh
new file mode 100644
index 00000000000..b7841caf522
--- /dev/null
+++ b/scripts/superbuild/ci/ci_corona_env.sh
@@ -0,0 +1,53 @@
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit.
For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+# Improve debugging info and remove some misguided warnings. These are
+# passed only to the LBANN stack.
+EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations"
+EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations"
+
+# Prefer RPATH to RUNPATH (stability over flexibility)
+EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags"
+# If using PrgEnv-cray add ${CRAYLIBS_X86_64}
+EXTRA_RPATHS="${ROCM_PATH}/lib|${ROCM_PATH}/llvm/lib"
+
+# Set this to the AMD GPU arch(s) to support (gfx906 is the value used for Corona)
+AMD_GPU_ARCH=gfx906
+
+ROCM_VER=$(basename "${ROCM_PATH:?ROCM_PATH is not set}")
+COMPILER_VER=clang-14.0.6-magic
+# Set to the preferred install directory
+CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies
+INSTALL_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/corona/${ROCM_VER}
+INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/openmpi-4.1.2
+
+# Use an accessible build directory so that the source files are preserved for debugging
+BUILD_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/.build/corona/${ROCM_VER}/${COMPILER_VER}
+
+# Location of external packages
+CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;}
+
+#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}
diff --git
a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index d3593772eae..e46c1badf3e 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -97,6 +97,7 @@ cmake \ -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hiptt=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 6c0204a9aec..67559fc6882 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -147,6 +147,13 @@ cat <> ${yaml} externals: - spec: aws-ofi-rccl@cxi arch=${spack_arch} prefix: ${prefix}/${sb_extra_prefix}/aws_ofi_rccl + hiptt: + buildable: false + version: + - 'master' + externals: + - spec: hiptt@master arch=${spack_arch} + prefix: ${prefix}/${sb_extra_prefix}/hiptt EOF fi From 84ad3bf5705efd7cd9b46c1a9fda53459515185a Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 28 Jun 2024 12:44:00 -0700 Subject: [PATCH 084/169] Moved the definition of the external hiptt to a ROCm only section. 
--- scripts/superbuild_externals.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index 67559fc6882..a574257386f 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -100,13 +100,6 @@ cat <> ${yaml} externals: - spec: conduit@develop arch=${spack_arch} prefix: ${prefix}/${sb_extra_prefix}/conduit - hiptt: - buildable: false - version: - - 'master' - externals: - - spec: hiptt@master arch=${spack_arch} - prefix: ${prefix}/${sb_extra_prefix}/hiptt opencv: buildable: false version: From 14cf442234492cb325347d4d559e01edbe248ace Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 1 Jul 2024 13:35:53 -0700 Subject: [PATCH 085/169] Update Corona to ROCm 6.0.2 --- scripts/customize_build_env.sh | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 6fd5e3c9990..dc95cc1f8b5 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -124,7 +124,7 @@ set_center_specific_modules() MODULE_CMD="module load gcc/10.2.1 mvapich2/2.3.6 python/3.7.2" ;; "zen" | "zen2") # Corona - MODULE_CMD="module load StdEnv clang/14.0.6-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.1" + MODULE_CMD="module load StdEnv clang/14.0.6-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/6.0.2" # MODULE_CMD="module load StdEnv gcc/10.3.1-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.0" # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; @@ -221,8 +221,8 @@ set_center_specific_spack_dependencies() ;; "zen" | "zen2") # Corona # On LC the mvapich2 being used is built against HWLOC v1 - CENTER_COMPILER="%rocmcc@5.7.1" - CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@5.7.1 ^python@3.9.12 ^py-protobuf@4.21.5" + CENTER_COMPILER="%rocmcc@6.0.2" + CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@6.0.2 ^python@3.9.12 
^py-protobuf@4.21.5" CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" ;; "zen3" | "zen4") # Tioga, RZVernal @@ -361,25 +361,25 @@ cat <> ${yaml} hipcub: buildable: false version: - - '5.7.1' + - '6.0.2' externals: - - spec: hipcub@5.7.1 arch=${spack_arch} - prefix: /opt/rocm-5.7.1/hipcub + - spec: hipcub@6.0.2 arch=${spack_arch} + prefix: /opt/rocm-6.0.2/hipcub extra_attributes: compilers: - c: /opt/rocm-5.7.1/llvm/bin/clang - c++: /opt/rocm-5.7.1/llvm/bin/clang++ + c: /opt/rocm-6.0.2/llvm/bin/clang + c++: /opt/rocm-6.0.2/llvm/bin/clang++ llvm-amdgpu: buildable: false version: - - '5.7.1' + - '6.0.2' externals: - - spec: llvm-amdgpu@5.7.1 arch=${spack_arch} - prefix: /opt/rocm-5.7.1/llvm + - spec: llvm-amdgpu@6.0.2 arch=${spack_arch} + prefix: /opt/rocm-6.0.2/llvm extra_attributes: compilers: - c: /opt/rocm-5.7.1/llvm/bin/clang - c++: /opt/rocm-5.7.1/llvm/bin/clang++ + c: /opt/rocm-6.0.2/llvm/bin/clang + c++: /opt/rocm-6.0.2/llvm/bin/clang++ openmpi: buildable: false version: @@ -390,8 +390,8 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - set_superbuild_externals ${host} "rocm-5.7.1" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-5.7.1" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "rocm-6.0.2" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-6.0.2" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "zen3" | "zen4") From 8755cb779a6c9354b054b759f038eae3cf31a1ca Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 2 Jul 2024 09:57:20 -0700 Subject: [PATCH 086/169] Changed the Corona externals to use variable for ROCm version. 
--- scripts/customize_build_env.sh | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index dc95cc1f8b5..b943511295f 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -124,7 +124,8 @@ set_center_specific_modules() MODULE_CMD="module load gcc/10.2.1 mvapich2/2.3.6 python/3.7.2" ;; "zen" | "zen2") # Corona - MODULE_CMD="module load StdEnv clang/14.0.6-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/6.0.2" + CI_ROCM_VER="5.7.1" + MODULE_CMD="module load StdEnv clang/14.0.6-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/${CI_ROCM_VER}" # MODULE_CMD="module load StdEnv gcc/10.3.1-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.0" # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 ;; @@ -221,8 +222,8 @@ set_center_specific_spack_dependencies() ;; "zen" | "zen2") # Corona # On LC the mvapich2 being used is built against HWLOC v1 - CENTER_COMPILER="%rocmcc@6.0.2" - CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@6.0.2 ^python@3.9.12 ^py-protobuf@4.21.5" + CENTER_COMPILER="%rocmcc@${CI_ROCM_VER}" + CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@${CI_ROCM_VER} ^python@3.9.12 ^py-protobuf@4.21.5" CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" ;; "zen3" | "zen4") # Tioga, RZVernal @@ -361,25 +362,25 @@ cat <> ${yaml} hipcub: buildable: false version: - - '6.0.2' + - '${CI_ROCM_VER}' externals: - - spec: hipcub@6.0.2 arch=${spack_arch} - prefix: /opt/rocm-6.0.2/hipcub + - spec: hipcub@${CI_ROCM_VER} arch=${spack_arch} + prefix: /opt/rocm-${CI_ROCM_VER}/hipcub extra_attributes: compilers: - c: /opt/rocm-6.0.2/llvm/bin/clang - c++: /opt/rocm-6.0.2/llvm/bin/clang++ + c: /opt/rocm-${CI_ROCM_VER}/llvm/bin/clang + c++: /opt/rocm-${CI_ROCM_VER}/llvm/bin/clang++ llvm-amdgpu: buildable: false version: - - '6.0.2' + - 
'${CI_ROCM_VER}' externals: - - spec: llvm-amdgpu@6.0.2 arch=${spack_arch} - prefix: /opt/rocm-6.0.2/llvm + - spec: llvm-amdgpu@${CI_ROCM_VER} arch=${spack_arch} + prefix: /opt/rocm-${CI_ROCM_VER}/llvm extra_attributes: compilers: - c: /opt/rocm-6.0.2/llvm/bin/clang - c++: /opt/rocm-6.0.2/llvm/bin/clang++ + c: /opt/rocm-${CI_ROCM_VER}/llvm/bin/clang + c++: /opt/rocm-${CI_ROCM_VER}/llvm/bin/clang++ openmpi: buildable: false version: @@ -390,8 +391,8 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - set_superbuild_externals ${host} "rocm-6.0.2" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" - set_superbuild_DHA_externals ${host} "rocm-6.0.2" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" + set_superbuild_externals ${host} "rocm-${CI_ROCM_VER}" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-${CI_ROCM_VER}" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "zen3" | "zen4") From dc3f1f3e63dfbebf06c16032ed0aabcce33655f6 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 2 Jul 2024 10:04:21 -0700 Subject: [PATCH 087/169] Exporting the shell variable. 
--- scripts/customize_build_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index b943511295f..2672ac78535 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -124,7 +124,7 @@ set_center_specific_modules() MODULE_CMD="module load gcc/10.2.1 mvapich2/2.3.6 python/3.7.2" ;; "zen" | "zen2") # Corona - CI_ROCM_VER="5.7.1" + export CI_ROCM_VER="5.7.1" MODULE_CMD="module load StdEnv clang/14.0.6-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/${CI_ROCM_VER}" # MODULE_CMD="module load StdEnv gcc/10.3.1-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.0" # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 From 00833eb2b87ba5c84bc5f3c58a04393daae36fba Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 2 Jul 2024 10:21:20 -0700 Subject: [PATCH 088/169] Moved when the ROCm version is defined. --- scripts/customize_build_env.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 2672ac78535..df26c2213df 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -124,7 +124,7 @@ set_center_specific_modules() MODULE_CMD="module load gcc/10.2.1 mvapich2/2.3.6 python/3.7.2" ;; "zen" | "zen2") # Corona - export CI_ROCM_VER="5.7.1" + #export CI_ROCM_VER="5.7.1" MODULE_CMD="module load StdEnv clang/14.0.6-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/${CI_ROCM_VER}" # MODULE_CMD="module load StdEnv gcc/10.3.1-magic openmpi/4.1.2 git/2.36.1 cmake/3.26.3 emacs/28.2 rocm/5.7.0" # ; ml use /opt/toss/modules/modulefiles && ml openmpi-gnu/4.1 @@ -222,6 +222,7 @@ set_center_specific_spack_dependencies() ;; "zen" | "zen2") # Corona # On LC the mvapich2 being used is built against HWLOC v1 + CI_ROCM_VER="5.7.1" CENTER_COMPILER="%rocmcc@${CI_ROCM_VER}" CENTER_DEPENDENCIES="^openmpi@4.1.2 
^hip@${CI_ROCM_VER} ^python@3.9.12 ^py-protobuf@4.21.5" CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" From d5639da288e179eb22be6fbd3d48f0de149fdaa6 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 2 Jul 2024 12:56:15 -0700 Subject: [PATCH 089/169] Back to 6.0.2 --- scripts/customize_build_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index df26c2213df..71dcb011cfb 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -222,7 +222,7 @@ set_center_specific_spack_dependencies() ;; "zen" | "zen2") # Corona # On LC the mvapich2 being used is built against HWLOC v1 - CI_ROCM_VER="5.7.1" + CI_ROCM_VER="6.0.2" CENTER_COMPILER="%rocmcc@${CI_ROCM_VER}" CENTER_DEPENDENCIES="^openmpi@4.1.2 ^hip@${CI_ROCM_VER} ^python@3.9.12 ^py-protobuf@4.21.5" CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" From 9a74dbe48ff35eda144c95f2ca6524948d86c6ef Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 2 Jul 2024 16:44:02 -0700 Subject: [PATCH 090/169] Trying a unified single pipeline for Pascal CI. 
--- .gitlab-ci.yml | 131 +++++++++++------------ .gitlab/pascal/single_pipeline.yml | 165 +++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 65 deletions(-) create mode 100644 .gitlab/pascal/single_pipeline.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0f6e76c95e4..b0cb5d3a59a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -36,27 +36,27 @@ variables: stages: - run-all-clusters -corona testing: - stage: run-all-clusters - variables: - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/corona/pipeline.yml +# corona testing: +# stage: run-all-clusters +# variables: +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# trigger: +# strategy: depend +# include: .gitlab/corona/pipeline.yml -corona distconv testing: - stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" - SPACK_SPECS: "+rocm +distconv" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" - trigger: - strategy: depend - include: .gitlab/corona/pipeline.yml +# corona distconv testing: +# stage: run-all-clusters +# variables: +# JOB_NAME_SUFFIX: _distconv +# SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" +# SPACK_SPECS: "+rocm +distconv" +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# TEST_FLAG: "test_*_distconv.py" +# trigger: +# strategy: depend +# include: .gitlab/corona/pipeline.yml # lassen testing: # stage: run-all-clusters @@ -88,52 +88,53 @@ pascal testing: WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" trigger: strategy: depend - include: .gitlab/pascal/pipeline.yml + include: .gitlab/pascal/single_pipeline.yml +# include: .gitlab/pascal/pipeline.yml -pascal compiler testing: - stage: run-all-clusters - variables: - SPACK_SPECS: "+cuda +half +fft" -# SPACK_SPECS: "%gcc@10.3.1 +cuda +half +fft" - 
BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/pascal/pipeline_compiler_tests.yml +# pascal compiler testing: +# stage: run-all-clusters +# variables: +# SPACK_SPECS: "+cuda +half +fft" +# # SPACK_SPECS: "%gcc@10.3.1 +cuda +half +fft" +# BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# trigger: +# strategy: depend +# include: .gitlab/pascal/pipeline_compiler_tests.yml -pascal distconv testing: - stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_SPECS: "+cuda +distconv +fft" -# SPACK_SPECS: "%gcc@10.3.1 +cuda +distconv +fft" - BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" - trigger: - strategy: depend - include: .gitlab/pascal/pipeline.yml +# pascal distconv testing: +# stage: run-all-clusters +# variables: +# JOB_NAME_SUFFIX: _distconv +# SPACK_SPECS: "+cuda +distconv +fft" +# # SPACK_SPECS: "%gcc@10.3.1 +cuda +distconv +fft" +# BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# TEST_FLAG: "test_*_distconv.py" +# trigger: +# strategy: depend +# include: .gitlab/pascal/pipeline.yml -tioga testing: - stage: run-all-clusters - variables: - # FF_USE_NEW_BASH_EVAL_STRATEGY: 1 - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/tioga/pipeline.yml +# tioga testing: +# stage: run-all-clusters +# variables: +# # FF_USE_NEW_BASH_EVAL_STRATEGY: 1 +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# trigger: +# strategy: depend +# include: .gitlab/tioga/pipeline.yml -tioga distconv testing: - stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_ENV_BASE_NAME_MODIFIER: 
"-distconv" - SPACK_SPECS: "+rocm +distconv" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" - trigger: - strategy: depend - include: .gitlab/tioga/pipeline.yml +# tioga distconv testing: +# stage: run-all-clusters +# variables: +# JOB_NAME_SUFFIX: _distconv +# SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" +# SPACK_SPECS: "+rocm +distconv" +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# TEST_FLAG: "test_*_distconv.py" +# trigger: +# strategy: depend +# include: .gitlab/tioga/pipeline.yml diff --git a/.gitlab/pascal/single_pipeline.yml b/.gitlab/pascal/single_pipeline.yml new file mode 100644 index 00000000000..81a07205f93 --- /dev/null +++ b/.gitlab/pascal/single_pipeline.yml @@ -0,0 +1,165 @@ +################################################################################ +## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. 
+################################################################################ + +# This is the testing pipeline for the Pascal cluster at LLNL. This +# cluster builds the LBANN applications and libraries using a single +# compiler toolchain and then runs a collection of tests. Testing +# output is in JUnit format and parsed by the pipeline for web +# viewing. + +# See the Catalyst pipeline for more thorough documentation. + +include: + - .gitlab/common/common.yml + +stages: + - allocate + - build + - test + - deallocate + +# Get LC resources. +allocate lc resources: + stage: allocate + extends: + - .pascal common + - .lbann-base-vars + variables: + GIT_STRATEGY: none + script: + - echo "== ACQUIRING SLURM RESOURCES ==" + - echo "${WITH_WEEKLY:+Running with --weekly}" + - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "120" || echo "90") + - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") + - salloc --exclusive -N ${LBANN_NNODES} -p pbatch -t ${TEST_TIME} --no-shell -J ${JOB_NAME} + timeout: 6h + +# Build LBANN and establish the Spack environment for this pipeline. 
+build and install:
+  extends:
+    - .pascal common
+    - .lbann-base-vars
+    - .lbann-artifacts
+  stage: build
+  script:
+    - echo "== BUILDING LBANN =="
+    - export JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A")
+    - !reference [.setup_spack, script]
+    - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG}
+      -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG}
+      --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies
+      --ci-pip --
+      +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib
+    - export TEST_TASKS_PER_NODE=2
+    - export TEST_MPIBIND_FLAG="--mpibind=off"
+    - export SPACK_ARCH=$(spack arch)
+    - export SPACK_ARCH_TARGET=$(spack arch -t)
+    - !reference [.setup_lbann, script]
+    - .gitlab/common/run-catch-tests.sh
+#
+    - echo "== SETUP SPACK ENVIRONMENT =="
+    - export SPACK_USER_CACHE_PATH=${SPACK_USER_CACHE_PATH}
+    - source ${HOME}/${SPACK_REPO}/share/spack/setup-env.sh
+    - source spack-ci-env-name.sh
+    - ml use ${LBANN_MODFILES_DIR}
+    - ml load lbann
+    - echo "$(which lbann)"
+#
+    - ([[ $(find ${RESULTS_DIR} -name "catch-tests-failed.txt" | wc -l) -eq 0 ]])
+#
+    - echo "== RUNNING PYTHON-BASED UNIT TESTS =="
+    - echo "Testing $(which lbann)"
+    - export OMP_NUM_THREADS=10
+    - export SLURM_JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A")
+    - pushd ci_test/unit_tests
+    # - echo "Running unit tests with file pattern: ${TEST_FLAG}"
+    - lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG}
+    - popd
+#
+    - echo "== RUNNING PYTHON-BASED INTEGRATION TESTS =="
+    - echo "Testing $(which lbann)"
+    - export OMP_NUM_THREADS=10
+    # - export SLURM_JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A")
+    - pushd ci_test/integration_tests
+    - export WEEKLY_FLAG=${WITH_WEEKLY:+--weekly}
+    # - echo "Running integration tests with file pattern: ${TEST_FLAG}"
+    # - echo "lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG}"
+    - lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG}
+    - popd
+  artifacts:
+    when: always
+    paths:
+      - ci_test/unit_tests/results.xml
+      - ci_test/integration_tests/results.xml
+    reports:  # single list: a repeated "junit" key cannot hold more than one value
+      junit: ["${RESULTS_DIR}/*.xml",
+              ci_test/unit_tests/results.xml,
+              ci_test/integration_tests/results.xml]
+
+# Cleanup the pipeline's Spack environment.
+# Switching over to reusing Spack environments for each feature branch so don't remove them immediately
+# Cleanup any build directories and spack environments older than 5 days since last use
+remove spack environment:
+  extends:
+    - .pascal common
+    - .lbann-base-vars
+    - .cleanup old spack environment
+  stage: deallocate
+  variables:
+    GIT_STRATEGY: none
+  when: always
+
+# Free the allocation we obtained in "allocate lc resources".
+release allocation:
+  stage: deallocate
+  extends:
+    - .pascal common
+    - .lbann-base-vars
+  variables:
+    GIT_STRATEGY: none
+  when: always
+  script:
+    - echo "== RELEASING RESOURCES =="
+    - export JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A")
+    - ([[ -n "${JOB_ID}" ]] && scancel ${JOB_ID})
+
+# Variables for Pascal.
+.pascal common:
+  variables:
+    # Just the obvious identifier. Which specific node doesn't matter.
+    SYSTEM_NAME: pascal
+    SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME}
+    SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git
+    #SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test
+    #SPACK_REPO: spack_repos/spack_test.git
+
+    # These are system-specific specs that should be forwarded to the
+    # build script
+    SPACK_SPECS: "+cuda +half +fft"
+
+  tags:
+    - pascal
+    - shell
From 62d5becf2d777eac41ebd81174861e1e38e5d047 Mon Sep 17 00:00:00 2001
From: "Brian C. Van Essen"
Date: Tue, 30 Jul 2024 13:31:14 -0700
Subject: [PATCH 091/169] Working on updating the CI builds to use a more direct script setup.
--- .gitlab/build-and-test.sh | 243 +++++++++++++++++++++++++++++++++++++ .gitlab/configure_deps.sh | 76 ++++++++++++ .gitlab/run_catch_tests.sh | 102 ++++++++++++++++ .gitlab/setup_env.sh | 170 ++++++++++++++++++++++++++ 4 files changed, 591 insertions(+) create mode 100755 .gitlab/build-and-test.sh create mode 100644 .gitlab/configure_deps.sh create mode 100644 .gitlab/run_catch_tests.sh create mode 100644 .gitlab/setup_env.sh diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh new file mode 100755 index 00000000000..f309a9dfc14 --- /dev/null +++ b/.gitlab/build-and-test.sh @@ -0,0 +1,243 @@ +#!/usr/bin/env bash + +################################################################################ +## Copyright 2019-2024 Lawrence Livermore National Security, LLC and other +## DiHydrogen Project Developers. See the top-level LICENSE file for details. +## +## SPDX-License-Identifier: Apache-2.0 +################################################################################ + +# Initialize modules for users not using bash as a default shell +modules_home=${MODULESHOME:-"/usr/share/lmod/lmod"} +if [[ -e ${modules_home}/init/bash ]] +then + source ${modules_home}/init/bash +fi + +set -o errexit +set -o nounset + +hostname="$(hostname)" +cluster=${hostname//[0-9]/} +project_dir="$(git rev-parse --show-toplevel)" +if [[ $? -eq 1 ]] +then + project_dir="$(pwd)" +fi + +# NOTE: No modules will be explicitly unloaded or purged. Obviously, +# loading a new compiler will trigger the auto-unload of the existing +# compiler module (and all the other side-effects wrt mpi, etc), but +# no explicit action is taken by this script. 
+modules=${MODULES:-""} +run_coverage=${WITH_COVERAGE:-""} +build_distconv=${WITH_DISTCONV:-""} + +job_unique_id=${CI_JOB_ID:-""} +prefix="" + +# Setup the module environment +if [[ -n "${modules}" ]] +then + echo "Loading modules: \"${modules}\"" + module load ${modules} +fi + +# Finish setting up the environment +source ${project_dir}/.gitlab/setup_env.sh + +# Make sure our working directory is something sane. +cd ${project_dir} + +# Create some temporary build space. +if [[ -z "${job_unique_id}" ]]; then + job_unique_id=manual_job_$(date +%F_%0H%0M) + while [[ -d "${project_dir}/build-${job_unique_id}" ]] ; do # probe the build dir this id would create (prefix is still empty here) + sleep 1 + job_unique_id=manual_job_$(date +%F_%0H%0M) + done +fi +build_dir=${BUILD_DIR:-"${project_dir}/build-${job_unique_id}"} +mkdir -p ${build_dir} + +# Dependencies +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Build and test started" +echo "~~~~~ Start: $(date)" +echo "~~~~~ Host: ${hostname}" +echo "~~~~~ Project dir: ${project_dir}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" + +# Just for good measure... +export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${prefix}/nccl:${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/catch2:${prefix}/hwloc:${prefix}/hydrogen:${prefix}/dihydrogen:${prefix}/nccl:${prefix}/spdlog:${CMAKE_PREFIX_PATH} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +echo "I am going to export the the prefix path ${CMAKE_PREFIX_PATH}" + +# Allow a user to force this +rebuild_deps=${REBUILD_DEPS:-""} + +# Rebuild if the prefix doesn't exist. +if [[ ! 
-d "${prefix}" ]] +then + rebuild_deps=1 +fi + +# Rebuild if latest hashes don't match +if [[ -z "${rebuild_deps}" ]] +then + function fetch-sha { + # $1 is the LLNL package name (e.g., 'aluminum') + # $2 is the branch name (e.g., 'master') + curl -s -H "Accept: application/vnd.github.VERSION.sha" \ + "https://api.github.com/repos/llnl/$1/commits/$2" + } + + al_head=$(fetch-sha aluminum master) + al_prebuilt="" + if [[ -f "${prefix}/al-prebuilt-hash.txt" ]] + then + al_prebuilt=$(cat ${prefix}/al-prebuilt-hash.txt) + fi + + h_head=$(fetch-sha elemental hydrogen) + h_prebuilt="" + if [[ -f "${prefix}/h-prebuilt-hash.txt" ]] + then + h_prebuilt=$(cat ${prefix}/h-prebuilt-hash.txt) + fi + + if [[ "${al_head}" != "${al_prebuilt}" ]] + then + echo "Prebuilt Aluminum hash does not match latest head; rebuilding." + echo " (prebuilt: ${al_prebuilt}; head: ${al_head})" + rebuild_deps=1 + fi + if [[ "${h_head}" != "${h_prebuilt}" ]] + then + echo "Prebuilt Hydrogen hash does not match latest head; rebuilding." + echo " (prebuilt: ${h_prebuilt}; head: ${h_head})" + rebuild_deps=1 + fi +fi + +if [[ -n "${rebuild_deps}" ]] +then + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Building Dependencies" + echo "~~~~~ Build dir: ${build_dir}" + echo "~~~~~ Install dir: ${prefix}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + # Get the superbuild because why not. 
+ lbann_sb_top_dir=${build_dir}/sb + lbann_sb_dir=${lbann_sb_top_dir}/scripts/superbuild + mkdir -p ${lbann_sb_top_dir} + cd ${lbann_sb_top_dir} + + # Sparse checkout of the SuperBuild + git init + git remote add origin https://github.com/llnl/lbann + git fetch --depth=1 origin develop + git config core.sparseCheckout true + echo "scripts/superbuild" >> .git/info/sparse-checkout + git pull --ff-only origin develop + + cd ${build_dir} + # Uses "${cluster}", "${prefix}", and "${lbann_sb_dir}" + source ${project_dir}/.gitlab/configure_deps.sh + cmake --build build-deps + + # Stamp these commits + cd ${build_dir}/build-deps/aluminum/src && git rev-parse HEAD > ${prefix}/al-prebuilt-hash.txt + cd ${build_dir}/build-deps/hydrogen/src && git rev-parse HEAD > ${prefix}/h-prebuilt-hash.txt + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Dependencies Built" + echo "~~~~~ $(date)" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +else + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Using Cached Dependencies" + echo "~~~~~ Prefix: ${prefix}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + for f in $(find ${prefix} -iname "*.pc"); + do + pfx=$(realpath $(dirname $(dirname $(dirname $f)))) + echo " >> Changing prefix in $(realpath $f) to: ${pfx}" + sed -i -e "s|^prefix=.*|prefix=${pfx}|g" $f + done +fi + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Building LBANN" +echo "~~~~~ $(date)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +prefix=${build_dir}/install +cd ${build_dir} +source ${project_dir}/.gitlab/configure_lbann.sh +if ! cmake --build build-lbann ; +then + echo "ERROR: compilation failed, building with verbose output..." 
+ cmake --build build-lbann --verbose -j 1 +else + ninja -C build-lbann install +fi + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Testing LBANN" +echo "~~~~~ $(date)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles +echo "I think that the module is in ${LBANN_MODFILES_DIR}" +ml use ${LBANN_MODFILES_DIR} +ml load lbann +echo "$(which lbann)" + +failed_tests=0 +source ${project_dir}/.gitlab/run_catch_tests.sh + +source ${project_dir}/.gitlab/run_unit_and_integration_tests.sh + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ LBANN Tests Complete" +echo "~~~~~ $(date)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +if [[ "${run_coverage}" == "1" ]] +then + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Generating code coverage reports" + echo "~~~~~ $(date)" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + # This is beyond obnoxious + gcovr_prefix=$(dirname $(dirname $(command -v gcovr))) + python_path=$(ls --color=no -1 -d ${gcovr_prefix}/lib/python*/site-packages) + echo "python_path=${python_path}" + PYTHONPATH=${python_path}:${PYTHONPATH} cmake --build build-h2 -t coverage + if [[ -e ${build_dir}/build-h2/coverage-gcovr.xml ]] + then + cp ${build_dir}/build-h2/coverage-gcovr.xml ${project_dir} + fi + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Generated code coverage reports" + echo "~~~~~ $(date)" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +fi + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Build and test completed" +echo "~~~~~ $(date)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +[[ "${failed_tests}" -eq 0 ]] && exit 0 || exit 1 diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh new file mode 100644 index 00000000000..7e9d908b4c4 --- /dev/null +++ b/.gitlab/configure_deps.sh @@ -0,0 +1,76 @@ +if [[ "$cluster" == "lassen" ]] +then + 
lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" +else + lapack_opt="" +fi + +cmake \ + -G Ninja \ + -S ${lbann_sb_dir} \ + -B ${build_dir}/build-deps \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${prefix} \ + \ + -D CMAKE_EXE_LINKER_FLAGS=${common_linker_flags} \ + -D CMAKE_SHARED_LINKER_FLAGS=${common_linker_flags} \ + \ + -D CMAKE_BUILD_RPATH=${extra_rpaths//:/|} \ + -D CMAKE_INSTALL_RPATH=${extra_rpaths//:/|} \ + \ + -D CMAKE_BUILD_RPATH_USE_ORIGIN=OFF \ + -D CMAKE_BUILD_WITH_INSTALL_RPATH=OFF \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_SKIP_BUILD_RPATH=OFF \ + -D CMAKE_SKIP_INSTALL_RPATH=OFF \ + -D CMAKE_SKIP_RPATH=OFF \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + \ + -D CMAKE_CUDA_ARCHITECTURES=${gpu_arch} \ + -D CMAKE_HIP_ARCHITECTURES=${gpu_arch} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=${cuda_platform} \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=${rocm_platform} \ + \ + -D LBANN_SB_BUILD_Aluminum=ON \ + -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ + -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ + -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_Hydrogen=ON \ + ${lapack_opt} \ + -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ + -D 
LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ + -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ + \ + -D LBANN_SB_BUILD_DiHydrogen=ON \ + -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + + + # -D LBANN_SB_BUILD_Catch2=ON \ + # -D LBANN_SB_Catch2_TAG="devel" \ + # \ + # -D LBANN_SB_BUILD_hwloc=${rocm_platform} \ + # -D LBANN_SB_BUILD_NCCL=${cuda_platform} \ + # -D LBANN_SB_BUILD_spdlog=ON \ + # \ diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh new file mode 100644 index 00000000000..fe5d81d7edb --- /dev/null +++ b/.gitlab/run_catch_tests.sh @@ -0,0 +1,102 @@ +# Run the sequential catch tests +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Sequential catch tests" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +timeout -k 1m 2m \ + ${build_dir}/build-lbann/unit_test/seq-catch-tests \ + -r console \ + -r JUnit::out=${project_dir}/seq-tests_junit.xml || { + failed_tests=$(( ${failed_tests} + $? )) + echo "******************************" + echo " >>> SeqCatchTests FAILED" + echo "******************************" +} + +# if [[ -e "${build_dir}/build-lbann/bin/GPUCatchTests" ]] +# then +# echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +# echo "~~~~~ GPU tests" +# echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +# timeout -k 1m 2m \ +# ${build_dir}/build-lbann/bin/GPUCatchTests \ +# -r console \ +# -r JUnit::out=${project_dir}/gpu-tests_junit.xml || { +# failed_tests=$(( ${failed_tests} + $? 
)) +# echo "******************************" +# echo " >>> GPUCatchTests FAILED" +# echo "******************************" +# } +# fi + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ MPI tests" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +# FIXME (trb 20240702): There's an issue with needing each process to +# write out its own coverage database file, and I just need to sort +# through that before this will work. So for now, just run these when +# not running coverage tests. +if [[ -z "${run_coverage}" ]] +then + case "${cluster}" in + pascal) + export OMPI_MCA_mpi_warn_on_fork=0 + timeout -k 1m 2m \ + srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r JUnit::out=${project_dir}/mpi-tests_junit.xml || { + failed_tests=$((${failed_tests=} + $?)) + echo "******************************" + echo " >>> MPICatchTests FAILED" + echo "******************************" +# -r mpicumulative \ + } + ;; + lassen) + timeout -k 1m 2m \ + jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r JUnit::out=${project_dir}/mpi-tests_junit.xml || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" +# -r mpicumulative \ + } + ;; + corona|tioga) + export H2_SELECT_DEVICE_0=1 + timeout -k 1m 2m \ + flux run -N1 -n8 -g1 --exclusive \ + --cwd=${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ + -r JUnit::out=${project_dir}/mpi-tests_junit.xml || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" + # -r mpicumulative \ + } +# \ +# -o "${project_dir}/mpi-catch-filesystem-results-rank=%r-size=%s.xml" + timeout -k 1m 2m \ + flux run -N1 -n8 -g1 --exclusive \ + 
--cwd=${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ + -r JUnit::out=${project_dir}/mpi-catch-filesystem-tests_junit.xml || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests [filesystem] FAILED" + echo "******************************" + # -r mpicumulative \ + } + ;; + *) + echo "Unknown cluster: ${cluster}" + ;; + esac +else + echo "NOTE: Skipping MPI tests." +fi diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh new file mode 100644 index 00000000000..98d2d75d2b3 --- /dev/null +++ b/.gitlab/setup_env.sh @@ -0,0 +1,170 @@ +# This is a collection of common variables and whatnot that may change +# based on the value of "${cluster}" or other variables. + +# To make things work with modules, the user can set "COMPILER_FAMILY" +# to "gnu", "clang", "amdclang", or "cray" and the suitable compiler +# paths will be deduced from the current PATH. Alternatively, users +# can set "CC"/"CXX" directly, in which case the +# "COMPILER_FAMILY" variable will be ignored. + +compiler_family=${COMPILER_FAMILY:-gnu} +case "${compiler_family,,}" in + gnu|gcc) + CC=${CC:-$(command -v gcc)} + CXX=${CXX:-$(command -v g++)} + ;; + clang) + CC=${CC:-$(command -v clang)} + CXX=${CXX:-$(command -v clang++)} + ;; + amdclang) + CC=${CC:-$(command -v amdclang)} + CXX=${CXX:-$(command -v amdclang++)} + ;; + cray) + CC=${CC:-$(command -v cc)} + CXX=${CXX:-$(command -v CC)} + ;; + *) + echo "Unknown compiler family: ${compiler_family}. Using gnu." 
+ CC=${CC:-$(command -v gcc)} + CXX=${CXX:-$(command -v g++)} + ;; +esac + +# HIP/CUDA configuration and launcher are platform-specific +CUDACXX=${CUDACXX:=""} +CUDAHOSTCXX=${CUDAHOSTCXX:=${CXX}} + +cuda_platform=OFF +rocm_platform=OFF + +launcher=mpiexec + +common_linker_flags="-Wl,--disable-new-dtags" +extra_rpaths=${extra_rpaths:-""} + +# Set to the preferred install directory for the external dependencies +INSTALL_EXTERNALS_ROOT=/usr/workspace/lbann/ci_stable_dependencies/${cluster} + +case "${cluster}" in + pascal) + CUDACXX=${CUDACXX:-$(command -v nvcc)} + CUDAHOSTCXX=${CUDAHOSTCXX:-${CXX}} + cuda_platform=ON + gpu_arch=60 + launcher=slurm + CUDA_VER=cuda-11.8.0 + COMPILER_VER=clang-14.0.6-magic + SYSTEM_INSTALL_PREFIX_EXTERNALS=${CUDA_VER}/${COMPILER_VER}/openmpi-4.1.2 + ;; + lassen) + CUDACXX=${CUDACXX:-$(command -v nvcc)} + CUDAHOSTCXX=${CUDAHOSTCXX:-${CXX}} + cuda_platform=ON + gpu_arch=70 + launcher=lsf + ;; + tioga) + cray_libs_dir=${CRAYLIBS_X86_64:-""} + if [[ -n "${cray_libs_dir}" ]] + then + extra_rpaths="${cray_libs_dir}:${ROCM_PATH}/lib:${extra_rpaths}" + else + extra_rpaths="${ROCM_PATH}/lib:${extra_rpaths}" + fi + rocm_platform=ON + gpu_arch=gfx90a + launcher=flux + ROCM_VER=$(basename ${ROCM_PATH}) + PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') + SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-8.1.29 + ;; + corona) + extra_rpaths="${ROCM_PATH}/lib:${extra_rpaths}" + rocm_platform=ON + gpu_arch=gfx906 + launcher=flux + ;; + *) + ;; +esac + +source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +#CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +CFLAGS=${CFLAGS:-""} +CXXFLAGS=${CXXFLAGS:-""} +LDFLAGS=${LDFLAGS:-""} +LDFLAGS="${common_linker_flags} ${LDFLAGS}" + +# Improve debugging info and remove some misguided warnings. These are +# passed only to the LBANN stack. 
+EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" +# Prefer RPATH to RUNPATH (stability over flexibility) +EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" + +# Update the location of external packages +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} + +# Set to ON to enable Half support. Only matters if building the +# LBANN stack. +BUILD_WITH_HALF=${BUILD_WITH_HALF:-OFF} +BUILD_WITH_DISTCONV=${BUILD_WITH_DISTCONV:-OFF} + +# Make sure the compilers and flags are exported +export CC CXX CUDACXX CUDAHOSTCXX CFLAGS CXXFLAGS LDFLAGS +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Environment Info" +echo "~~~~~" +echo "~~~~~ Cluster: ${cluster}" +echo "~~~~~ CUDA? ${cuda_platform}" +echo "~~~~~ ROCm? ${rocm_platform}" +echo "~~~~~ GPU arch: ${gpu_arch}" +echo "~~~~~ Launcher: ${launcher}" +echo "~~~~~" +echo "~~~~~ Compiler family: ${compiler_family}" +echo "~~~~~ CC: ${CC}" +echo "~~~~~ CXX: ${CXX}" +echo "~~~~~ CUDACXX: ${CUDACXX}" +echo "~~~~~ CUDAHOSTCXX: ${CUDAHOSTCXX}" +echo "~~~~~" +echo "~~~~~ CFLAGS: ${CFLAGS}" +echo "~~~~~ CXXFLAGS: ${CXXFLAGS}" +echo "~~~~~ LDFLAGS: ${LDFLAGS}" +echo "~~~~~ Extra rpaths: ${extra_rpaths}" +echo "~~~~~ CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +# Handle cuDNN +if [[ "${cuda_platform}" == "ON" ]] +then + cuda_maj_version=$(basename ${CUDA_HOME} | grep -E --color=no -o "[0-9]+\.[0-9]+\.[0-9]+" | cut -d '.' -f 1) + arch=$(uname -m) + cudnn_root=$(ls -1 -d /usr/workspace/brain/cudnn/cudnn-*/cuda_${cuda_maj_version}_${arch} | tail -1) + if [[ -z "${cudnn_root}" ]] + then + echo "WARNING: No suitable cuDNN found." + else + CMAKE_PREFIX_PATH=${cudnn_root}:${CMAKE_PREFIX_PATH:-""} + fi +fi + +# Get Breathe, gcovr, and Ninja. Putting this off to the side because +# I don't want to tweak "the real" python environment, but it's just +# these one or two things so it's not worth a venv. 
+if [[ -n "${run_coverage}" ]] +then + python_pkgs="ninja gcovr" +else + python_pkgs="ninja" +fi + +export PYTHONUSERBASE=${TMPDIR}/${USER}/python/${cluster} +export PATH=${PYTHONUSERBASE}/bin:${PATH} +python3 -m pip install --user ${python_pkgs} + +# Make sure the PYTHONPATH is all good. +export PYTHONPATH=$(ls --color=no -1 -d ${PYTHONUSERBASE}/lib/python*/site-packages | paste -sd ":" - ):${PYTHONPATH:-""} From 3972df77bb7f3d85a792a903dc955d8e6299d742 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 30 Jul 2024 13:32:57 -0700 Subject: [PATCH 092/169] Added configure scripts for LBANN and a script to run the unit and integration tests. --- .gitlab/configure_lbann.sh | 62 +++++++++++++ .gitlab/run_unit_and_integration_tests.sh | 108 ++++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 .gitlab/configure_lbann.sh create mode 100644 .gitlab/run_unit_and_integration_tests.sh diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh new file mode 100644 index 00000000000..46b7e9ad0dd --- /dev/null +++ b/.gitlab/configure_lbann.sh @@ -0,0 +1,62 @@ +if [[ "$cluster" == "lassen" ]] +then + lapack_opt="-D BLA_VENDOR=Generic" +else + lapack_opt="" +fi + +cmake -G Ninja \ + -S ${project_dir} \ + -B ${build_dir}/build-lbann \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${prefix}/lbann \ + \ + -D CMAKE_BUILD_RPATH="${extra_rpaths//:/\;}" \ + -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/\;}" \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + \ + -D CMAKE_CUDA_ARCHITECTURES=${gpu_arch} \ + -D CMAKE_HIP_ARCHITECTURES=${gpu_arch} \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_EXPORT_COMPILE_COMMANDS=ON \ 
+ -D CMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -D CMAKE_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + -D LBANN_DATATYPE=float \ + -D LBANN_WITH_CALIPER=OFF \ + -D LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ + -D LBANN_WITH_TBINF=OFF \ + -D LBANN_WITH_UNIT_TESTING=ON \ + -D LBANN_WITH_CNPY=ON \ + -D LBANN_DETERMINISTIC=ON \ + -D LBANN_WITH_ADDRESS_SANITIZER=OFF \ + -D LBANN_WITH_FFT=OFF \ + -D LBANN_WITH_EMBEDDED_PYTHON=ON \ + -D LBANN_WITH_PYTHON_FRONTEND=ON \ + -D LBANN_WITH_VISION=ON + + +# -D LBANN_WITH_CUTENSOR=OFF \ + +# \ +# -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} + +# -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ + + # -D LBANN_SB_DEFAULT_CUDA_OPTS=${cuda_platform} \ + # -D LBANN_SB_DEFAULT_ROCM_OPTS=${rocm_platform} \ + # -D LBANN_WITH_NVSHMEM=OFF \ + + # \ + # -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh new file mode 100644 index 00000000000..328d9ac1eb1 --- /dev/null +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -0,0 +1,108 @@ +#!/bin/bash -l + +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') +LBANN_DIR=$(git rev-parse --show-toplevel) + +cd ${LBANN_DIR}/ci_test + +echo "${PWD}/run.sh CLUSTER=${CLUSTER}" + +PYTHON=python3 +LBANN_PYTHON=lbann_pfe.sh + +# WEEKLY=0 +# while :; do +# case ${1} in +# --weekly) +# # Run all tests. This is a weekly build. +# echo "Setting WEEKLY in run.sh" +# WEEKLY=1 +# ;; +# -?*) +# # Unknown option +# echo "Unknown option (${1})" >&2 +# exit 1 +# ;; +# *) +# # Break loop if there are no more options +# break +# esac +# shift +# done + +# # Use the spack provided by the CI +# source ${HOME}/${SPACK_REPO}/share/spack/setup-env.sh + +# # "spack" is just a shell function; it may not be exported to this +# # scope. Just to be sure, reload the shell integration. +# if [ -n "${SPACK_ROOT}" ]; then +# source ${SPACK_ROOT}/share/spack/setup-env.sh +# else +# echo "Spack required. 
Please set SPACK_ROOT environment variable" +# exit 1 +# fi + +# SPACK_VERSION=$(spack --version | sed 's/-.*//g' | sed 's/[(].*[)]//g') +# MIN_SPACK_VERSION=0.18.0 + +# source ${LBANN_DIR}/scripts/utilities.sh + +# compare_versions ${SPACK_VERSION} ${MIN_SPACK_VERSION} +# VALID_SPACK=$? + +# if [[ ${VALID_SPACK} -eq 2 ]]; then +# echo "Newer version of Spack required. Detected version ${SPACK_VERSION} requires at least ${MIN_SPACK_VERSION}" +# exit 1 +# fi + +# echo "run.sh WEEKLY=" +# echo $WEEKLY + +# echo "Task: Cleaning" +# ./clean.sh + +# echo "Discovered installed module file: ${LBANN_MODFILES_DIR}" +# echo "Discovered Spack environment: ${SPACK_ENV_NAME}" +# echo "Task: Compiler Tests" +# cd compiler_tests +# $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml || exit 1 + +# Find the correct module to load +# SPACK_ARCH=$(spack arch) +# SPACK_ARCH_TARGET=$(spack arch -t) +# export LBANN_BUILD_LABEL="lbann_${SYSTEM_NAME}_${SPACK_ENV_NAME}-${SPACK_ARCH_TARGET}" +# export LBANN_BUILD_PARENT_DIR="${CI_PROJECT_DIR}/builds/${LBANN_BUILD_LABEL}" +# export LBANN_INSTALL_DIR="${LBANN_BUILD_PARENT_DIR}/install" +# export LBANN_MODFILES_DIR="${LBANN_INSTALL_DIR}/etc/modulefiles" +# ml use ${LBANN_MODFILES_DIR} +# ml load lbann + +echo "Testing $(which lbann) from $(pwd) +cd unit_tests" +#cd .. + +flux run -N1 -n8 -g1 --exclusive lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml + +exit 0 +# These tests are "allowed" to fail inside the script. That is, the +# unit tests should be run even if these fail. The status is cached +# for now. +echo "Task: Integration Tests" +cd integration_tests +if [ ${WEEKLY} -ne 0 ]; then + $LBANN_PYTHON -m pytest -s -vv --durations=0 --weekly --junitxml=results.xml + status=$? +else + $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml + status=$? +fi +cd .. 
+ +echo "Task: Unit Tests" +cd unit_tests +OMP_NUM_THREADS=10 $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +status=$(($status + $?)) +cd .. + +echo "Task: Finished" +exit $status From 661b5119fedf60714eba732eb5d469609d8dd7c4 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 31 Jul 2024 12:03:02 -0700 Subject: [PATCH 093/169] Cleaning up the CI scripts. --- .gitlab/build-and-test.sh | 39 +++++++---------------- .gitlab/run_catch_tests.sh | 1 + .gitlab/run_unit_and_integration_tests.sh | 33 +++++++++++-------- 3 files changed, 32 insertions(+), 41 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index f309a9dfc14..3b80fba677d 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -191,15 +191,24 @@ else fi echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ Testing LBANN" +echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm" +echo ${CMD} +${CMD} + LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles -echo "I think that the module is in ${LBANN_MODFILES_DIR}" +#echo "I think that the module is in ${LBANN_MODFILES_DIR}" ml use ${LBANN_MODFILES_DIR} ml load lbann -echo "$(which lbann)" +#echo "$(which lbann)" + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Testing LBANN: $(which lbann)" +echo "~~~~~ $(date)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" failed_tests=0 source ${project_dir}/.gitlab/run_catch_tests.sh @@ -211,30 +220,6 @@ echo "~~~~~ LBANN Tests Complete" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -if [[ "${run_coverage}" == "1" ]] -then - - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Generating code coverage reports" - echo "~~~~~ $(date)" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - - # This is beyond 
obnoxious - gcovr_prefix=$(dirname $(dirname $(command -v gcovr))) - python_path=$(ls --color=no -1 -d ${gcovr_prefix}/lib/python*/site-packages) - echo "python_path=${python_path}" - PYTHONPATH=${python_path}:${PYTHONPATH} cmake --build build-h2 -t coverage - if [[ -e ${build_dir}/build-h2/coverage-gcovr.xml ]] - then - cp ${build_dir}/build-h2/coverage-gcovr.xml ${project_dir} - fi - - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Generated code coverage reports" - echo "~~~~~ $(date)" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -fi - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Build and test completed" echo "~~~~~ $(date)" diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh index fe5d81d7edb..c28a123be48 100644 --- a/.gitlab/run_catch_tests.sh +++ b/.gitlab/run_catch_tests.sh @@ -45,6 +45,7 @@ then export OMPI_MCA_mpi_warn_on_fork=0 timeout -k 1m 2m \ srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ + -D ${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ -r JUnit::out=${project_dir}/mpi-tests_junit.xml || { failed_tests=$((${failed_tests=} + $?)) diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index 328d9ac1eb1..7337e27c2ca 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -77,30 +77,35 @@ LBANN_PYTHON=lbann_pfe.sh # ml use ${LBANN_MODFILES_DIR} # ml load lbann -echo "Testing $(which lbann) from $(pwd) -cd unit_tests" -#cd .. +# cd unit_tests +# echo "Testing $(which lbann) from $(pwd)" + +case "${cluster}" in + pascal) + export OMPI_MCA_mpi_warn_on_fork=0 + ;; + lassen) + ;; + corona|tioga) + export H2_SELECT_DEVICE_0=1 + ;; + *) + echo "Unknown cluster: ${cluster}" + ;; +esac -flux run -N1 -n8 -g1 --exclusive lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml - -exit 0 # These tests are "allowed" to fail inside the script. 
That is, the # unit tests should be run even if these fail. The status is cached # for now. echo "Task: Integration Tests" cd integration_tests -if [ ${WEEKLY} -ne 0 ]; then - $LBANN_PYTHON -m pytest -s -vv --durations=0 --weekly --junitxml=results.xml - status=$? -else - $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml - status=$? -fi +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +status=$? cd .. echo "Task: Unit Tests" cd unit_tests -OMP_NUM_THREADS=10 $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml status=$(($status + $?)) cd .. From 0f180d2edb2dbbdefeb4d3b1112a5bfc33f0c59f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 31 Jul 2024 12:15:29 -0700 Subject: [PATCH 094/169] Added GitLab CI yaml files. --- .gitlab-ci.yml | 30 ++++++++++----- .gitlab/build-and-test-common.yml | 64 +++++++++++++++++++++++++++++++ .gitlab/build-and-test-tioga.yml | 49 +++++++++++++++++++++++ 3 files changed, 134 insertions(+), 9 deletions(-) create mode 100644 .gitlab/build-and-test-common.yml create mode 100644 .gitlab/build-and-test-tioga.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b0cb5d3a59a..bce0a519c3c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -28,6 +28,10 @@ # clusters. To run testing locally, consult the README in the ci_test # directory. 
+include: + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' + variables: FF_USE_NEW_BASH_EVAL_STRATEGY: 'true' FF_ENABLE_BASH_EXIT_CODE_CHECK: 1 @@ -81,15 +85,15 @@ stages: # strategy: depend # include: .gitlab/lassen/multi_stage_pipeline.yml -pascal testing: - stage: run-all-clusters - variables: - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/pascal/single_pipeline.yml -# include: .gitlab/pascal/pipeline.yml +# pascal testing: +# stage: run-all-clusters +# variables: +# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" +# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" +# trigger: +# strategy: depend +# include: .gitlab/pascal/single_pipeline.yml +# # include: .gitlab/pascal/pipeline.yml # pascal compiler testing: # stage: run-all-clusters @@ -138,3 +142,11 @@ pascal testing: # trigger: # strategy: depend # include: .gitlab/tioga/pipeline.yml + +tioga testing: + stage: run-all-clusters + trigger: + strategy: depend + include: '.gitlab/build-and-test-tioga.yml' + forward: + pipeline_variables: true diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml new file mode 100644 index 00000000000..beced4237ed --- /dev/null +++ b/.gitlab/build-and-test-common.yml @@ -0,0 +1,64 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. 
You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +.build-and-test-base: + variables: + LLNL_SERVICE_USER: lbannusr + LLNL_SLURM_SCHEDULER_PARAMETERS: "-N1 -t 90" + LLNL_FLUX_SCHEDULER_PARAMETERS: "-N1 -t 120m" + LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 1 -W 60" + GIT_SUBMODULE_STRATEGY: recursive + GIT_DEPTH: 5 + script: + - printenv > ${CI_PROJECT_DIR}/ci_environment.log + - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh + cache: + key: $CI_JOB_NAME_SLUG + paths: + - install-deps-${CI_JOB_NAME_SLUG} + +.build-and-test: + artifacts: + when: always + paths: + - "${CI_PROJECT_DIR}/*junit.*xml" + - "${CI_PROJECT_DIR}/ci_environment.log" + - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/build.ninja" + reports: + junit: "${CI_PROJECT_DIR}/*junit.*xml" + extends: .build-and-test-base + +.build-and-test-coverage: + artifacts: + when: always + paths: + - "${CI_PROJECT_DIR}/*junit.*xml" + - "${CI_PROJECT_DIR}/ci_environment.log" + - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/build.ninja" + - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/coverage/" + reports: + junit: "${CI_PROJECT_DIR}/*junit.*xml" + extends: .build-and-test-base diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml new file mode 100644 index 00000000000..c92dffac13c --- /dev/null +++ b/.gitlab/build-and-test-tioga.yml @@ -0,0 +1,49 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. 
+## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +default: + id_tokens: + SITE_ID_TOKEN: + aud: https://lc.llnl.gov/gitlab + +stages: + - build + +include: + local: "/.gitlab/build-and-test-common.yml" + +rocm-5-7-1-tioga: + variables: + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-amd amd/5.7.1 rocm/5.7.1 cray-mpich cray-libsci" + extends: .build-and-test-on-tioga + +.build-and-test-on-tioga: + stage: build + tags: [tioga, batch] + variables: + LLNL_FLUX_SCHEDULER_PARAMETERS: "-N1 -t 20 -q pci" + extends: .build-and-test From 4771fec9abc33792962a84bde783142c7960fc05 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 31 Jul 2024 12:28:01 -0700 Subject: [PATCH 095/169] Lowered the git depth. 
--- .gitlab/build-and-test-common.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index beced4237ed..e564fd1f6d0 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -31,7 +31,7 @@ LLNL_FLUX_SCHEDULER_PARAMETERS: "-N1 -t 120m" LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 1 -W 60" GIT_SUBMODULE_STRATEGY: recursive - GIT_DEPTH: 5 + GIT_DEPTH: 1 script: - printenv > ${CI_PROJECT_DIR}/ci_environment.log - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh From 84aa7728f4a05684d2bd49f5d4b38b9871955564 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 31 Jul 2024 12:54:45 -0700 Subject: [PATCH 096/169] Fix the submodule strategy. --- .gitlab/build-and-test-common.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index e564fd1f6d0..beb4b4392ef 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -30,8 +30,8 @@ LLNL_SLURM_SCHEDULER_PARAMETERS: "-N1 -t 90" LLNL_FLUX_SCHEDULER_PARAMETERS: "-N1 -t 120m" LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 1 -W 60" - GIT_SUBMODULE_STRATEGY: recursive - GIT_DEPTH: 1 + GIT_SUBMODULE_STRATEGY: none + GIT_DEPTH: 5 script: - printenv > ${CI_PROJECT_DIR}/ci_environment.log - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh From 20281ca7dfea23d057c8adc0a2cb32f55bf71c66 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 31 Jul 2024 13:18:13 -0700 Subject: [PATCH 097/169] Fixed the CI tests to use 2 nodes. Better error handling. 
--- .gitlab/build-and-test-common.yml | 6 ++-- .gitlab/build-and-test-tioga.yml | 2 +- .gitlab/run_catch_tests.sh | 40 +++++++++++++++++++---- .gitlab/run_unit_and_integration_tests.sh | 12 +++++-- 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index beb4b4392ef..ce1fed06afd 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -27,9 +27,9 @@ .build-and-test-base: variables: LLNL_SERVICE_USER: lbannusr - LLNL_SLURM_SCHEDULER_PARAMETERS: "-N1 -t 90" - LLNL_FLUX_SCHEDULER_PARAMETERS: "-N1 -t 120m" - LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 1 -W 60" + LLNL_SLURM_SCHEDULER_PARAMETERS: "-N2 -t 90" + LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120m" + LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 2 -W 60" GIT_SUBMODULE_STRATEGY: none GIT_DEPTH: 5 script: diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index c92dffac13c..bee1c13b115 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -45,5 +45,5 @@ rocm-5-7-1-tioga: stage: build tags: [tioga, batch] variables: - LLNL_FLUX_SCHEDULER_PARAMETERS: "-N1 -t 20 -q pci" + LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120 -q pci" extends: .build-and-test diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh index c28a123be48..c4779809145 100644 --- a/.gitlab/run_catch_tests.sh +++ b/.gitlab/run_catch_tests.sh @@ -46,26 +46,52 @@ then timeout -k 1m 2m \ srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ -D ${build_dir}/build-lbann \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ - -r JUnit::out=${project_dir}/mpi-tests_junit.xml || { + ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \\ + -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-tests-rank=%r-size=%s_junit.xml || { failed_tests=$((${failed_tests=} + $?)) echo 
"******************************" echo " >>> MPICatchTests FAILED" echo "******************************" # -r mpicumulative \ } + timeout -k 1m 2m \ + srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ + -D ${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ + -r console::out=${project_dir}/mpi-catch-filesystem-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-filesystem-tests-rank=%r-size=%s_junit.xml || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests [filesystem] FAILED" + echo "******************************" + # -r mpicumulative \ + } ;; lassen) timeout -k 1m 2m \ jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ - -r JUnit::out=${project_dir}/mpi-tests_junit.xml || { + ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ + -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-tests-rank=%r-size=%s_junit.xml || { failed_tests=$((${failed_tests} + $?)) echo "******************************" echo " >>> mpi-catch-tests FAILED" echo "******************************" # -r mpicumulative \ } + timeout -k 1m 2m \ + jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ + -h ${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ + -r console::out=${project_dir}/mpi-catch-filesystem-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-filesystem-tests-rank=%r-size=%s_junit.xml || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests [filesystem] FAILED" + echo "******************************" + # -r mpicumulative \ + } ;; corona|tioga) export H2_SELECT_DEVICE_0=1 @@ -73,7 +99,8 @@ then flux run -N1 -n8 -g1 --exclusive \ --cwd=${build_dir}/build-lbann \ 
${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ - -r JUnit::out=${project_dir}/mpi-tests_junit.xml || { + -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-tests-rank=%r-size=%s_junit.xml || { failed_tests=$((${failed_tests} + $?)) echo "******************************" echo " >>> mpi-catch-tests FAILED" @@ -86,7 +113,8 @@ then flux run -N1 -n8 -g1 --exclusive \ --cwd=${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ - -r JUnit::out=${project_dir}/mpi-catch-filesystem-tests_junit.xml || { + -r console::out=${project_dir}/mpi-catch-filesystem-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-filesystem-tests-rank=%r-size=%s_junit.xml || { failed_tests=$((${failed_tests} + $?)) echo "******************************" echo " >>> mpi-catch-tests [filesystem] FAILED" diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index 7337e27c2ca..9b3161eeba4 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -99,13 +99,21 @@ esac # for now. echo "Task: Integration Tests" cd integration_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml || { + echo "******************************" + echo " >>> Integration Tests FAILED" + echo "******************************" +} status=$? cd .. echo "Task: Unit Tests" cd unit_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml || { + echo "******************************" + echo " >>> Unit Tests FAILED" + echo "******************************" +} status=$(($status + $?)) cd .. From 60e264caeaed5d7981f3ef117a5bc04dfd1d8013 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Wed, 31 Jul 2024 15:51:55 -0700 Subject: [PATCH 098/169] Fixed the name of the test result files so that they would be picked up by CI. --- .gitlab/run_unit_and_integration_tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index 9b3161eeba4..67b54ea1252 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -99,7 +99,7 @@ esac # for now. echo "Task: Integration Tests" cd integration_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml || { +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml || { echo "******************************" echo " >>> Integration Tests FAILED" echo "******************************" @@ -109,7 +109,7 @@ cd .. echo "Task: Unit Tests" cd unit_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml || { +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_results_junit.xml || { echo "******************************" echo " >>> Unit Tests FAILED" echo "******************************" From 45641125afe4a74a167e2409ff538271416f6b93 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 08:57:54 -0700 Subject: [PATCH 099/169] Added a test pascal pipeline. 
--- .gitlab-ci.yml | 8 ++++++ .gitlab/build-and-test-pascal.yml | 46 +++++++++++++++++++++++++++++++ .gitlab/build-and-test-tioga.yml | 2 +- 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 .gitlab/build-and-test-pascal.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bce0a519c3c..8b62bd72900 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -150,3 +150,11 @@ tioga testing: include: '.gitlab/build-and-test-tioga.yml' forward: pipeline_variables: true + +pascal testing: + stage: run-all-clusters + trigger: + strategy: depend + include: '.gitlab/build-and-test-pascal.yml' + forward: + pipeline_variables: true diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml new file mode 100644 index 00000000000..165654fa4b2 --- /dev/null +++ b/.gitlab/build-and-test-pascal.yml @@ -0,0 +1,46 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. 
+################################################################################ + +default: + id_tokens: + SITE_ID_TOKEN: + aud: https://lc.llnl.gov/gitlab + +stages: + - build + +include: + local: "/.gitlab/build-and-test-common.yml" + +clang-14-0-6-cuda-11-8-0-pascal: + variables: + COMPILER_FAMILY: clang + MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" + extends: [.build-and-test-on-pascal, .build-and-test] + +.build-and-test-on-pascal: + stage: build + tags: [pascal, batch] diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index bee1c13b115..2f1836d4c0f 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -45,5 +45,5 @@ rocm-5-7-1-tioga: stage: build tags: [tioga, batch] variables: - LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120 -q pci" + LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120 -q pdebug" extends: .build-and-test From 396dbdb250b544a8fe8ed7d2e46d48cd21db8f5f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 10:07:38 -0700 Subject: [PATCH 100/169] Fixed how the DistConv flag is propagated. 
--- .gitlab/build-and-test-tioga.yml | 7 +++++++ .gitlab/build-and-test.sh | 25 ++++++++++++++++++++++--- .gitlab/configure_deps.sh | 2 +- .gitlab/configure_lbann.sh | 2 +- .gitlab/setup_env.sh | 3 +++ CMakeLists.txt | 2 +- LICENSE | 2 +- 7 files changed, 36 insertions(+), 7 deletions(-) diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index 2f1836d4c0f..6931081dd0d 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -41,6 +41,13 @@ rocm-5-7-1-tioga: MODULES: "PrgEnv-amd amd/5.7.1 rocm/5.7.1 cray-mpich cray-libsci" extends: .build-and-test-on-tioga +rocm-5-7-1-distconv-tioga: + variables: + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-amd amd/5.7.1 rocm/5.7.1 cray-mpich cray-libsci" + WITH_DISTCONV: "1" + extends: .build-and-test-on-tioga + .build-and-test-on-tioga: stage: build tags: [tioga, batch] diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 3b80fba677d..3fb89b22c64 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -1,10 +1,29 @@ #!/usr/bin/env bash ################################################################################ -## Copyright 2019-2024 Lawrence Livermore National Security, LLC and other -## DiHydrogen Project Developers. See the top-level LICENSE file for details. +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. ## -## SPDX-License-Identifier: Apache-2.0 +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. 
You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. ################################################################################ # Initialize modules for users not using bash as a default shell diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index 7e9d908b4c4..ce3d1cae904 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -63,7 +63,7 @@ cmake \ -D LBANN_SB_BUILD_DiHydrogen=ON \ -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ + -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${build_distconv:-OFF} \ -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 46b7e9ad0dd..6ca09dc4e00 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -35,7 +35,7 @@ cmake -G Ninja \ -D CMAKE_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_DATATYPE=float \ -D LBANN_WITH_CALIPER=OFF \ - -D LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ + -D LBANN_WITH_DISTCONV=${build_distconv:-OFF} \ -D LBANN_WITH_TBINF=OFF \ -D LBANN_WITH_UNIT_TESTING=ON \ -D LBANN_WITH_CNPY=ON \ diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 98d2d75d2b3..691aa0a3e07 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -85,6 +85,9 @@ case "${cluster}" in rocm_platform=ON gpu_arch=gfx906 launcher=flux + ROCM_VER=$(basename ${ROCM_PATH}) + COMPILER_VER=clang-14.0.6-magic + 
SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${COMPILER_VER}/openmpi-4.1.2 ;; *) ;; diff --git a/CMakeLists.txt b/CMakeLists.txt index 11f7f06fb23..3a4a51a5d4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ################################################################################ -## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. ## Produced at the Lawrence Livermore National Laboratory. ## Written by the LBANN Research Team (B. Van Essen, et al.) listed in ## the CONTRIBUTORS file. diff --git a/LICENSE b/LICENSE index 68681ede2ee..5d854378863 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. Written by the LBANN Research Team (B. Van Essen, et al.) listed in the CONTRIBUTORS file. From 18bf20d042306ed6f62ab16aa5538b9e8baa51a7 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 11:50:49 -0700 Subject: [PATCH 101/169] Added external flags for building with HALF and FFT support. Limited the distconv builds to only run the right tests. 
--- .gitlab/build-and-test-common.yml | 1 + .gitlab/build-and-test-pascal.yml | 2 ++ .gitlab/build-and-test.sh | 7 +++++++ .gitlab/configure_deps.sh | 2 +- .gitlab/configure_lbann.sh | 2 +- .gitlab/run_unit_and_integration_tests.sh | 20 ++++++++++++-------- 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index ce1fed06afd..0a25e359748 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -39,6 +39,7 @@ key: $CI_JOB_NAME_SLUG paths: - install-deps-${CI_JOB_NAME_SLUG} + timeout: 6h .build-and-test: artifacts: diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index 165654fa4b2..576923d3213 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -39,6 +39,8 @@ clang-14-0-6-cuda-11-8-0-pascal: variables: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" + WITH_HALF: "1" + WITH_FFT: "1" extends: [.build-and-test-on-pascal, .build-and-test] .build-and-test-on-pascal: diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 3fb89b22c64..430544950ab 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -51,6 +51,13 @@ fi modules=${MODULES:-""} run_coverage=${WITH_COVERAGE:-""} build_distconv=${WITH_DISTCONV:-""} +build_half=${WITH_HALF:-""} +build_fft=${WITH_FFT:-""} + +TEST_FLAG=${WITH_DISTCONV:-""} +if [[ ${WITH_DISTCONV} ]]; then + TEST_FLAG="test_*_distconv.py" +fi job_unique_id=${CI_JOB_ID:-""} prefix="" diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index ce3d1cae904..eb67026d408 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -55,7 +55,7 @@ cmake \ ${lapack_opt} \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ + -D 
LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 6ca09dc4e00..b32ffeedef5 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -41,7 +41,7 @@ cmake -G Ninja \ -D LBANN_WITH_CNPY=ON \ -D LBANN_DETERMINISTIC=ON \ -D LBANN_WITH_ADDRESS_SANITIZER=OFF \ - -D LBANN_WITH_FFT=OFF \ + -D LBANN_WITH_FFT=${build_fft:-OFF} \ -D LBANN_WITH_EMBEDDED_PYTHON=ON \ -D LBANN_WITH_PYTHON_FRONTEND=ON \ -D LBANN_WITH_VISION=ON diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index 67b54ea1252..d6ed3f148e6 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -94,28 +94,32 @@ case "${cluster}" in ;; esac +export OMP_NUM_THREADS=10 + # These tests are "allowed" to fail inside the script. That is, the # unit tests should be run even if these fail. The status is cached # for now. -echo "Task: Integration Tests" +echo "Task: Integration Tests with file pattern: ${TEST_FLAG}" cd integration_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml || { +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml ${TEST_FLAG} || { + status=$? + failed_tests=$(( ${failed_tests} + $? )) echo "******************************" echo " >>> Integration Tests FAILED" echo "******************************" } -status=$? cd .. 
-echo "Task: Unit Tests" +echo "Task: Unit Tests with file pattern: ${TEST_FLAG}" cd unit_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_results_junit.xml || { +$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_results_junit.xml ${TEST_FLAG} || { + status=$(($status + $?)) + failed_tests=$(( ${failed_tests} + $? )) echo "******************************" echo " >>> Unit Tests FAILED" echo "******************************" } -status=$(($status + $?)) cd .. -echo "Task: Finished" -exit $status +echo "Task: Finished with status ${status}" +#exit $status From 9c9119fbd94134c41be3341fe427ac38cf64bad3 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 11:55:21 -0700 Subject: [PATCH 102/169] Cleaning up code. --- .gitlab/configure_deps.sh | 18 ++++++------------ .gitlab/configure_lbann.sh | 22 ---------------------- 2 files changed, 6 insertions(+), 34 deletions(-) diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index eb67026d408..f0f43b3ff10 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -1,8 +1,10 @@ if [[ "$cluster" == "lassen" ]] then - lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" + hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" + dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=Generic" else - lapack_opt="" + hydrogen_lapack_opt="" + dihydrogen_lapack_opt="" fi cmake \ @@ -52,7 +54,7 @@ cmake \ -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ \ -D LBANN_SB_BUILD_Hydrogen=ON \ - ${lapack_opt} \ + ${hydrogen_lapack_opt} \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \ @@ -61,16 +63,8 @@ cmake \ -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ \ -D LBANN_SB_BUILD_DiHydrogen=ON \ + ${dihydrogen_lapack_opt} \ -D 
LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${build_distconv:-OFF} \ -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - - - # -D LBANN_SB_BUILD_Catch2=ON \ - # -D LBANN_SB_Catch2_TAG="devel" \ - # \ - # -D LBANN_SB_BUILD_hwloc=${rocm_platform} \ - # -D LBANN_SB_BUILD_NCCL=${cuda_platform} \ - # -D LBANN_SB_BUILD_spdlog=ON \ - # \ diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index b32ffeedef5..d8e5462b618 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -1,10 +1,3 @@ -if [[ "$cluster" == "lassen" ]] -then - lapack_opt="-D BLA_VENDOR=Generic" -else - lapack_opt="" -fi - cmake -G Ninja \ -S ${project_dir} \ -B ${build_dir}/build-lbann \ @@ -45,18 +38,3 @@ cmake -G Ninja \ -D LBANN_WITH_EMBEDDED_PYTHON=ON \ -D LBANN_WITH_PYTHON_FRONTEND=ON \ -D LBANN_WITH_VISION=ON - - -# -D LBANN_WITH_CUTENSOR=OFF \ - -# \ -# -D CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ - - # -D LBANN_SB_DEFAULT_CUDA_OPTS=${cuda_platform} \ - # -D LBANN_SB_DEFAULT_ROCM_OPTS=${rocm_platform} \ - # -D LBANN_WITH_NVSHMEM=OFF \ - - # \ - # -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ From 07f9fce4dde32bd0b9574b1b716598f7c0cd5989 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 12:05:21 -0700 Subject: [PATCH 103/169] Added distconv pascal test. 
--- .gitlab/build-and-test-pascal.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index 576923d3213..766aafa7d5e 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -43,6 +43,15 @@ clang-14-0-6-cuda-11-8-0-pascal: WITH_FFT: "1" extends: [.build-and-test-on-pascal, .build-and-test] +clang-14-0-6-cuda-11-8-0-distconv-pascal: + variables: + COMPILER_FAMILY: clang + MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" + WITH_HALF: "1" + WITH_FFT: "1" + WITH_DISTCONV: "1" + extends: [.build-and-test-on-pascal, .build-and-test] + .build-and-test-on-pascal: stage: build tags: [pascal, batch] From e84fb10e3a3bf211f4e0a13c43ae61c85231ddb5 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 13:07:25 -0700 Subject: [PATCH 104/169] Fix the status capture. --- .gitlab/run_unit_and_integration_tests.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index d6ed3f148e6..dea8f2d774c 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -102,8 +102,9 @@ export OMP_NUM_THREADS=10 echo "Task: Integration Tests with file pattern: ${TEST_FLAG}" cd integration_tests $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml ${TEST_FLAG} || { - status=$? - failed_tests=$(( ${failed_tests} + $? )) + this_status=$? + status=$(( $status + $this_status )) + failed_tests=$(( $failed_tests + $this_status )) echo "******************************" echo " >>> Integration Tests FAILED" echo "******************************" @@ -113,8 +114,9 @@ cd .. 
echo "Task: Unit Tests with file pattern: ${TEST_FLAG}" cd unit_tests $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_results_junit.xml ${TEST_FLAG} || { - status=$(($status + $?)) - failed_tests=$(( ${failed_tests} + $? )) + this_status=$? + status=$(( $status + $this_status )) + failed_tests=$(( $failed_tests + $this_status )) echo "******************************" echo " >>> Unit Tests FAILED" echo "******************************" From f8d37fd8f46b660c10acf404cccff2de3413d1f3 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 13:17:37 -0700 Subject: [PATCH 105/169] Fixed logic bug in bash. --- .gitlab/build-and-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 430544950ab..f1cde420070 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -55,7 +55,7 @@ build_half=${WITH_HALF:-""} build_fft=${WITH_FFT:-""} TEST_FLAG=${WITH_DISTCONV:-""} -if [[ ${WITH_DISTCONV} ]]; then +if [[ ${build_distconv} ]]; then TEST_FLAG="test_*_distconv.py" fi From adc008fcfdbbf3e33a5fe71fe917be078397a2b5 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Thu, 1 Aug 2024 13:43:59 -0700 Subject: [PATCH 106/169] Fixed the include path to Half and disabled FFT --- .gitlab/build-and-test-pascal.yml | 4 ++-- .gitlab/setup_env.sh | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index 766aafa7d5e..1bd754dd55d 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -40,7 +40,7 @@ clang-14-0-6-cuda-11-8-0-pascal: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" WITH_HALF: "1" - WITH_FFT: "1" + WITH_FFT: "0" extends: [.build-and-test-on-pascal, .build-and-test] clang-14-0-6-cuda-11-8-0-distconv-pascal: @@ -48,7 +48,7 @@ clang-14-0-6-cuda-11-8-0-distconv-pascal: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" WITH_HALF: "1" - WITH_FFT: "1" + WITH_FFT: "0" WITH_DISTCONV: "1" extends: [.build-and-test-on-pascal, .build-and-test] diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 691aa0a3e07..a90fef3165c 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -45,7 +45,8 @@ common_linker_flags="-Wl,--disable-new-dtags" extra_rpaths=${extra_rpaths:-""} # Set to the preferred install directory for the external dependencies -INSTALL_EXTERNALS_ROOT=/usr/workspace/lbann/ci_stable_dependencies/${cluster} +CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies +INSTALL_EXTERNALS_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/${cluster} case "${cluster}" in pascal) @@ -94,6 +95,7 @@ case "${cluster}" in esac source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh +export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} #CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} From a11bd664d689cf88fcd6fd10518e3f0bbc3921dd Mon Sep 
17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 15:00:37 -0700 Subject: [PATCH 107/169] Fixed the failed test reporting and that distconv and half don't play together. --- .gitlab/build-and-test-pascal.yml | 2 +- .gitlab/run_unit_and_integration_tests.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index 1bd754dd55d..47a577ff4e7 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -47,7 +47,7 @@ clang-14-0-6-cuda-11-8-0-distconv-pascal: variables: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" - WITH_HALF: "1" + WITH_HALF: "0" WITH_FFT: "0" WITH_DISTCONV: "1" extends: [.build-and-test-on-pascal, .build-and-test] diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index dea8f2d774c..4e5edacda7b 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -123,5 +123,5 @@ $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_r } cd .. -echo "Task: Finished with status ${status}" +echo "Task: Finished with ${failed_tests} failed tests" #exit $status From ba047864deb1c60ff22d0c9ab964958e4a26891b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 16:05:40 -0700 Subject: [PATCH 108/169] Extend the mpi catch tests time limit. 
--- .gitlab/run_catch_tests.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh index c4779809145..84fe5a29276 100644 --- a/.gitlab/run_catch_tests.sh +++ b/.gitlab/run_catch_tests.sh @@ -3,7 +3,7 @@ echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Sequential catch tests" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -timeout -k 1m 2m \ +timeout -k 1m 10m \ ${build_dir}/build-lbann/unit_test/seq-catch-tests \ -r console \ -r JUnit::out=${project_dir}/seq-tests_junit.xml || { @@ -43,7 +43,7 @@ then case "${cluster}" in pascal) export OMPI_MCA_mpi_warn_on_fork=0 - timeout -k 1m 2m \ + timeout -k 1m 10m \ srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ -D ${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \\ @@ -55,7 +55,7 @@ then echo "******************************" # -r mpicumulative \ } - timeout -k 1m 2m \ + timeout -k 1m 10m \ srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ -D ${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ @@ -69,7 +69,7 @@ then } ;; lassen) - timeout -k 1m 2m \ + timeout -k 1m 10m \ jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ @@ -80,7 +80,7 @@ then echo "******************************" # -r mpicumulative \ } - timeout -k 1m 2m \ + timeout -k 1m 10m \ jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ -h ${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ @@ -95,7 +95,7 @@ then ;; corona|tioga) export H2_SELECT_DEVICE_0=1 - timeout -k 1m 2m \ + timeout -k 1m 10m \ flux run -N1 -n8 -g1 --exclusive \ --cwd=${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" 
"exclude:[filesystem]" \ @@ -109,7 +109,7 @@ then } # \ # -o "${project_dir}/mpi-catch-filesystem-results-rank=%r-size=%s.xml" - timeout -k 1m 2m \ + timeout -k 1m 10m \ flux run -N1 -n8 -g1 --exclusive \ --cwd=${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ From c0193450eec9726090648dee5e8edba463a1e7ca Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 1 Aug 2024 22:34:27 -0700 Subject: [PATCH 109/169] Added optimization flags for DHA --- .gitlab/run_unit_and_integration_tests.sh | 3 ++- .gitlab/setup_env.sh | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index 4e5edacda7b..e6b3f160766 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -99,6 +99,7 @@ export OMP_NUM_THREADS=10 # These tests are "allowed" to fail inside the script. That is, the # unit tests should be run even if these fail. The status is cached # for now. +status=0 echo "Task: Integration Tests with file pattern: ${TEST_FLAG}" cd integration_tests $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml ${TEST_FLAG} || { @@ -123,5 +124,5 @@ $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_r } cd .. -echo "Task: Finished with ${failed_tests} failed tests" +echo "Task: Finished with status ${status} and ${failed_tests} failed tests" #exit $status diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index a90fef3165c..2e3c0564e5d 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -106,8 +106,8 @@ LDFLAGS="${common_linker_flags} ${LDFLAGS}" # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. 
-EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-O2 -g3 -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" From 1bc3cd498978840287feb4cf53a94002a9dcfc69 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 2 Aug 2024 09:05:35 -0700 Subject: [PATCH 110/169] Temporarily force rebuild of dependencies. --- .gitlab/build-and-test-common.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index 0a25e359748..7438828571d 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -32,6 +32,7 @@ LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 2 -W 60" GIT_SUBMODULE_STRATEGY: none GIT_DEPTH: 5 + REBUILD_DEPS: 1 script: - printenv > ${CI_PROJECT_DIR}/ci_environment.log - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh From 2ee089db891769dc77d36255ca94928583a90710 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Fri, 2 Aug 2024 13:55:28 -0700 Subject: [PATCH 111/169] Fixed typo --- .gitlab/run_catch_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh index 84fe5a29276..1f851f823d1 100644 --- a/.gitlab/run_catch_tests.sh +++ b/.gitlab/run_catch_tests.sh @@ -46,7 +46,7 @@ then timeout -k 1m 10m \ srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ -D ${build_dir}/build-lbann \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \\ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ -r JUnit::out=${project_dir}/mpi-tests-rank=%r-size=%s_junit.xml || { failed_tests=$((${failed_tests=} + $?)) From 77fd1a44c2e58cbdbdb654a899108872da4107b5 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 2 Aug 2024 16:54:56 -0700 Subject: [PATCH 112/169] Added Corona to new CI. --- .gitlab-ci.yml | 8 +++++ .gitlab/build-and-test-corona.yml | 54 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 .gitlab/build-and-test-corona.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8b62bd72900..166c467a428 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -158,3 +158,11 @@ pascal testing: include: '.gitlab/build-and-test-pascal.yml' forward: pipeline_variables: true + +corona testing: + stage: run-all-clusters + trigger: + strategy: depend + include: '.gitlab/build-and-test-corona.yml' + forward: + pipeline_variables: true diff --git a/.gitlab/build-and-test-corona.yml b/.gitlab/build-and-test-corona.yml new file mode 100644 index 00000000000..07858c15d12 --- /dev/null +++ b/.gitlab/build-and-test-corona.yml @@ -0,0 +1,54 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. 
+## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +default: + id_tokens: + SITE_ID_TOKEN: + aud: https://lc.llnl.gov/gitlab + +stages: + - build + +include: + local: "/.gitlab/build-and-test-common.yml" + +rocm-5-7-1-corona: + variables: + COMPILER_FAMILY: amdclang + MODULES: "rocm/5.7.1 clang/14.0.6-magic openmpi/4.1.2" + extends: .build-and-test-on-corona + +rocm-5-7-1-distconv-corona: + variables: + COMPILER_FAMILY: amdclang + MODULES: "rocm/5.7.1 clang/14.0.6-magic openmpi/4.1.2" + WITH_DISTCONV: "1" + extends: .build-and-test-on-corona + +.build-and-test-on-corona: + stage: build + tags: [corona, batch] + extends: .build-and-test From 47d9973f2d9bafa72f58dfdc3085f1acd0c14025 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 5 Aug 2024 23:10:59 -0700 Subject: [PATCH 113/169] Added config for Lassen. 
--- .gitlab/build-and-test-lassen.yml | 56 ++++++++++++++++++++++++++ .gitlab/build-and-test.sh | 3 +- .gitlab/configure_deps.sh | 8 ++-- .gitlab/setup_env.sh | 5 +++ scripts/superbuild/ci/ci_pascal_env.sh | 2 +- 5 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 .gitlab/build-and-test-lassen.yml diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml new file mode 100644 index 00000000000..bf2a04fdded --- /dev/null +++ b/.gitlab/build-and-test-lassen.yml @@ -0,0 +1,56 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. 
+################################################################################ + +default: + id_tokens: + SITE_ID_TOKEN: + aud: https://lc.llnl.gov/gitlab + +stages: + - build + +include: + local: "/.gitlab/build-and-test-common.yml" + +clang-16-0-6-gcc-11-2-1-cuda-12-2-2-lassen: + variables: + COMPILER_FAMILY: clang + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.8.2" + extends: .build-and-test-on-lassen + +clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: + variables: + COMPILER_FAMILY: clang + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.8.2" + WITH_DISTCONV: "1" + extends: .build-and-test-on-lassen + rules: + - if: $TEST_DISTCONV_BUILD == "1" + +.build-and-test-on-lassen: + stage: build + tags: [lassen, batch] + extends: .build-and-test diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index f1cde420070..135e0255ccc 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -97,7 +97,8 @@ echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" # Just for good measure... 
-export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${prefix}/nccl:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen::${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${prefix}/nccl:${CMAKE_PREFIX_PATH} #export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/catch2:${prefix}/hwloc:${prefix}/hydrogen:${prefix}/dihydrogen:${prefix}/nccl:${prefix}/spdlog:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index f0f43b3ff10..61445a257bc 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -1,7 +1,9 @@ if [[ "$cluster" == "lassen" ]] then - hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" - dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=Generic" + hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=IBMESSL" + dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=IBMESSL" + # hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" + # dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=Generic" else hydrogen_lapack_opt="" dihydrogen_lapack_opt="" @@ -22,6 +24,7 @@ cmake \ -D CMAKE_BUILD_RPATH=${extra_rpaths//:/|} \ -D CMAKE_INSTALL_RPATH=${extra_rpaths//:/|} \ \ + -D BUILD_SHARED_LIBS=ON \ -D CMAKE_BUILD_RPATH_USE_ORIGIN=OFF \ -D CMAKE_BUILD_WITH_INSTALL_RPATH=OFF \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ @@ -54,7 +57,6 @@ cmake \ -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ \ -D LBANN_SB_BUILD_Hydrogen=ON \ - ${hydrogen_lapack_opt} \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \ diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 2e3c0564e5d..c7a60c0dd68 100644 --- a/.gitlab/setup_env.sh +++ 
b/.gitlab/setup_env.sh @@ -65,6 +65,10 @@ case "${cluster}" in cuda_platform=ON gpu_arch=70 launcher=lsf + CUDA_VER=cuda-12.2.2 + COMPILER_VER=clang-16.0.6 + SYSTEM_INSTALL_PREFIX_EXTERNALS=${CUDA_VER}/${COMPILER_VER}/spectrum-mpi-rolling-release + export CMAKE_PREFIX_PATH="${CI_STABLE_DEPENDENCIES_ROOT}/${cluster}/${CUDA_VER}/nccl_2.20.3-1+cuda12.2_ppc64le:${CI_STABLE_DEPENDENCIES_ROOT}/${cluster}/${CUDA_VER}/cudnn-linux-ppc64le-8.9.7.29_cuda12-archive:${CMAKE_PREFIX_PATH:-""}" ;; tioga) cray_libs_dir=${CRAYLIBS_X86_64:-""} @@ -94,6 +98,7 @@ case "${cluster}" in ;; esac +export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-""} source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} #CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh index b8a8003d88b..6118315ddf2 100644 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ b/scripts/superbuild/ci/ci_pascal_env.sh @@ -47,5 +47,5 @@ INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/openmpi-4.1.2 BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/pascal/${CUDA_VER}/${COMPILER_VER} # Location of external packages -export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-2.0.1.2/libcutensor-linux-x86_64-2.0.1.2-archive +export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} From 84de254a607e37748a6fd5217eb9b6838fb5b9dc Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Tue, 6 Aug 2024 09:56:04 -0700 Subject: [PATCH 114/169] Fixed how the lapack argument is passed to Hydrogen --- .gitlab/configure_deps.sh | 9 +++++---- .gitlab/configure_lbann.sh | 8 ++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index 61445a257bc..1852f5e2012 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -1,9 +1,9 @@ if [[ "$cluster" == "lassen" ]] then - hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=IBMESSL" - dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=IBMESSL" - # hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" - # dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=Generic" + # hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=IBMESSL" + # dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=IBMESSL" + hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" + dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=Generic" else hydrogen_lapack_opt="" dihydrogen_lapack_opt="" @@ -57,6 +57,7 @@ cmake \ -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ \ -D LBANN_SB_BUILD_Hydrogen=ON \ + ${hydrogen_lapack_opt} \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \ diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index d8e5462b618..7c0e3cf8de5 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -1,3 +1,10 @@ +if [[ "$cluster" == "lassen" ]] +then + # lbann_lapack_opt="-D LBANN_BLA_VENDOR=IBMESSL" + lbann_lapack_opt="-D LBANN_BLA_VENDOR=Generic" +else + lbann_lapack_opt="" +fi cmake -G Ninja \ -S ${project_dir} \ -B ${build_dir}/build-lbann \ @@ -26,6 +33,7 @@ cmake -G Ninja \ -D CMAKE_EXPORT_COMPILE_COMMANDS=ON \ -D CMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D 
CMAKE_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ + ${lbann_lapack_opt} \ -D LBANN_DATATYPE=float \ -D LBANN_WITH_CALIPER=OFF \ -D LBANN_WITH_DISTCONV=${build_distconv:-OFF} \ From ab6e9d2d9172d1d1b94edab9d4eedbca1cc1fff7 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 6 Aug 2024 10:13:00 -0700 Subject: [PATCH 115/169] Fixed flag for LBANN BLA. --- .gitlab/configure_lbann.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 7c0e3cf8de5..df7f2a53886 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -1,7 +1,7 @@ if [[ "$cluster" == "lassen" ]] then # lbann_lapack_opt="-D LBANN_BLA_VENDOR=IBMESSL" - lbann_lapack_opt="-D LBANN_BLA_VENDOR=Generic" + lbann_lapack_opt="-D BLA_VENDOR=Generic" else lbann_lapack_opt="" fi From a7219fa43261b8ed9f992cdbb9f1a7e5108262e9 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 6 Aug 2024 10:34:33 -0700 Subject: [PATCH 116/169] Added scripts to install core dependencies for lassen. --- .../ci/ci_lassen_core_dependencies.sh | 103 ++++++++++++++++++ scripts/superbuild/ci/ci_lassen_env.sh | 53 +++++++++ 2 files changed, 156 insertions(+) create mode 100755 scripts/superbuild/ci/ci_lassen_core_dependencies.sh create mode 100755 scripts/superbuild/ci/ci_lassen_env.sh diff --git a/scripts/superbuild/ci/ci_lassen_core_dependencies.sh b/scripts/superbuild/ci/ci_lassen_core_dependencies.sh new file mode 100755 index 00000000000..2d00ca3eea3 --- /dev/null +++ b/scripts/superbuild/ci/ci_lassen_core_dependencies.sh @@ -0,0 +1,103 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. 
+## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +source ${SUPERBUILD_SRC_DIR}/ci/ci_lassen_env.sh + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} + +# Set to the preferred build directory +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies + +#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + \ + -D CMAKE_C_COMPILER=$(which clang) \ + -D CMAKE_CXX_COMPILER=$(which clang++) \ + -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ + -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ + -D 
CMAKE_Fortran_COMPILER=$(which gfortran) \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ + \ + -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ + \ + -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenCV_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ + -D LBANN_SB_OpenCV_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ + -D LBANN_SB_FWD_OpenCV_WITH_IPP=OFF \ + -D LBANN_SB_OpenCV_TAG=4.x \ + -D LBANN_SB_OpenCV_TAG=4.x + +# Save a list of the currently loaded modules +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_lassen_env.sh b/scripts/superbuild/ci/ci_lassen_env.sh new file mode 100755 index 00000000000..b3ba9d62293 --- /dev/null +++ b/scripts/superbuild/ci/ci_lassen_env.sh @@ -0,0 +1,53 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. 
+## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Improve debugging info and remove some misguided warnings. These are +# passed only to the LBANN stack. 
+EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" + +# Prefer RPATH to RUNPATH (stability over flexibility) +EXTRA_LINK_FLAGS_CORE="-Wl,--disable-new-dtags" +EXTRA_LINK_FLAGS="-fuse-ld=lld ${EXTRA_LINK_FLAGS_CORE}" + +# Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) +CUDA_GPU_ARCH=70 + +#CUDA_VER=cuda-11.8.0 +CUDA_VER=cuda-12.2.2 +COMPILER_VER=clang-16.0.6 +#COMPILER_VER=clang-14.0.6-magic +# Set to the preferred install directory +CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies +INSTALL_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/lassen/${CUDA_VER} +INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/spectrum-mpi-rolling-release + +# Use an accessible build directory so that the source files are preserved for debugging +BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/lassen/${CUDA_VER}/${COMPILER_VER} + +# Location of external packages +export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} From 4a10d9a97dfcc193f5fd20003612c37a5297e2d6 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 6 Aug 2024 11:28:12 -0700 Subject: [PATCH 117/169] Added Lassen CI.
--- .gitlab-ci.yml | 8 ++++++++ scripts/superbuild/ci/ci_lassen_env.sh | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 166c467a428..6ddc24652f1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -166,3 +166,11 @@ corona testing: include: '.gitlab/build-and-test-corona.yml' forward: pipeline_variables: true + +lassen testing: + stage: run-all-clusters + trigger: + strategy: depend + include: '.gitlab/build-and-test-lassen.yml' + forward: + pipeline_variables: true diff --git a/scripts/superbuild/ci/ci_lassen_env.sh b/scripts/superbuild/ci/ci_lassen_env.sh index b3ba9d62293..ab5d785b909 100755 --- a/scripts/superbuild/ci/ci_lassen_env.sh +++ b/scripts/superbuild/ci/ci_lassen_env.sh @@ -49,5 +49,5 @@ INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/spectrum-mpi-rolling-re BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/lassen/${CUDA_VER}/${COMPILER_VER} # Location of external packages -export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive +export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-1.7.0.1/libcutensor-linux-ppc64le-1.7.0.1-archive CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} From 51f7f98cc823b6752c10b617c095b0fb6c643051 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 6 Aug 2024 14:00:29 -0700 Subject: [PATCH 118/169] Adding in some help for extra rpaths. 
--- .gitlab/build-and-test.sh | 5 +---- .gitlab/configure_deps.sh | 2 -- .gitlab/configure_lbann.sh | 7 ++++++- .gitlab/setup_env.sh | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 135e0255ccc..8cdeac4c063 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -98,12 +98,8 @@ prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" # Just for good measure... export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen::${CMAKE_PREFIX_PATH} -#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${prefix}/nccl:${CMAKE_PREFIX_PATH} -#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/catch2:${prefix}/hwloc:${prefix}/hydrogen:${prefix}/dihydrogen:${prefix}/nccl:${prefix}/spdlog:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -echo "I am going to export the the prefix path ${CMAKE_PREFIX_PATH}" - # Allow a user to force this rebuild_deps=${REBUILD_DEPS:-""} @@ -204,6 +200,7 @@ fi echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Building LBANN" echo "~~~~~ $(date)" +echo "~~~~~ CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix=${build_dir}/install diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index 1852f5e2012..f2b62a9adab 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -1,7 +1,5 @@ if [[ "$cluster" == "lassen" ]] then - # hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=IBMESSL" - # dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=IBMESSL" hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic" dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=Generic" else diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index df7f2a53886..cafab5c991c 100644 --- a/.gitlab/configure_lbann.sh +++ 
b/.gitlab/configure_lbann.sh @@ -29,7 +29,6 @@ cmake -G Ninja \ -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ \ - -D BUILD_SHARED_LIBS=ON \ -D CMAKE_EXPORT_COMPILE_COMMANDS=ON \ -D CMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D CMAKE_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ @@ -46,3 +45,9 @@ cmake -G Ninja \ -D LBANN_WITH_EMBEDDED_PYTHON=ON \ -D LBANN_WITH_PYTHON_FRONTEND=ON \ -D LBANN_WITH_VISION=ON + + # -D CMAKE_BUILD_RPATH_USE_ORIGIN=OFF \ + # -D CMAKE_BUILD_WITH_INSTALL_RPATH=OFF \ + # -D CMAKE_SKIP_BUILD_RPATH=OFF \ + # -D CMAKE_SKIP_INSTALL_RPATH=OFF \ + # -D CMAKE_SKIP_RPATH=OFF \ diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index c7a60c0dd68..1053c62c52a 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -74,9 +74,9 @@ case "${cluster}" in cray_libs_dir=${CRAYLIBS_X86_64:-""} if [[ -n "${cray_libs_dir}" ]] then - extra_rpaths="${cray_libs_dir}:${ROCM_PATH}/lib:${extra_rpaths}" + extra_rpaths="${cray_libs_dir}:${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" else - extra_rpaths="${ROCM_PATH}/lib:${extra_rpaths}" + extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi rocm_platform=ON gpu_arch=gfx90a @@ -86,7 +86,7 @@ case "${cluster}" in SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-8.1.29 ;; corona) - extra_rpaths="${ROCM_PATH}/lib:${extra_rpaths}" + extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" rocm_platform=ON gpu_arch=gfx906 launcher=flux @@ -111,8 +111,8 @@ LDFLAGS="${common_linker_flags} ${LDFLAGS}" # Improve debugging info and remove some misguided warnings. These are # passed only to the LBANN stack. 
-EXTRA_CXX_FLAGS="-O2 -g3 -Wno-deprecated-declarations" -EXTRA_HIP_FLAGS="-O2 -g3 -Wno-deprecated-declarations" +EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" From 6737e4ad20aa97265677c8ac2d6882c3bedd0e2b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 6 Aug 2024 14:20:30 -0700 Subject: [PATCH 119/169] Force LBANN to RPATH DHA libraries inside of the project. --- .gitlab/build-and-test.sh | 3 ++- .gitlab/configure_lbann.sh | 5 +++++ .gitlab/pascal/single_pipeline.yml | 2 +- .gitlab/tioga/pipeline.yml | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 8cdeac4c063..4d1eb46bd5c 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -95,9 +95,10 @@ echo "~~~~~ Project dir: ${project_dir}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" +dha_prefix=${prefix} # Just for good measure... -export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen::${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} # Allow a user to force this diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index cafab5c991c..3226c34e5eb 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -5,6 +5,11 @@ then else lbann_lapack_opt="" fi + +# Just for good measure... 
+extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64:${extra_rpaths:-""} + + cmake -G Ninja \ -S ${project_dir} \ -B ${build_dir}/build-lbann \ diff --git a/.gitlab/pascal/single_pipeline.yml b/.gitlab/pascal/single_pipeline.yml index 81a07205f93..8becbd000f3 100644 --- a/.gitlab/pascal/single_pipeline.yml +++ b/.gitlab/pascal/single_pipeline.yml @@ -1,5 +1,5 @@ ################################################################################ -## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. ## Produced at the Lawrence Livermore National Laboratory. ## Written by the LBANN Research Team (B. Van Essen, et al.) listed in ## the CONTRIBUTORS file. diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 9bfbc2f0633..708da83f520 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -1,5 +1,5 @@ ################################################################################ -## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. ## Produced at the Lawrence Livermore National Laboratory. ## Written by the LBANN Research Team (B. Van Essen, et al.) listed in ## the CONTRIBUTORS file. From 9b5cd55b647bf9116283a8eda1145e22293a3201 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 6 Aug 2024 15:19:57 -0700 Subject: [PATCH 120/169] Improve the reporting of the MPI catch tests. Consolidated all of the MPI catch tests to a single execution. Avoid logging unit and catch testing outputs to console. 
--- .gitlab/build-and-test-common.yml | 2 +- .gitlab/build-and-test-lassen.yml | 2 - .gitlab/run_catch_tests.sh | 177 ++++++++-------------- .gitlab/run_unit_and_integration_tests.sh | 13 +- cmake/modules/SetupMPI.cmake | 4 +- 5 files changed, 74 insertions(+), 124 deletions(-) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index 7438828571d..b6e3cf2e143 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -32,7 +32,7 @@ LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 2 -W 60" GIT_SUBMODULE_STRATEGY: none GIT_DEPTH: 5 - REBUILD_DEPS: 1 +# REBUILD_DEPS: 1 script: - printenv > ${CI_PROJECT_DIR}/ci_environment.log - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index bf2a04fdded..329979e0953 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -47,8 +47,6 @@ clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.8.2" WITH_DISTCONV: "1" extends: .build-and-test-on-lassen - rules: - - if: $TEST_DISTCONV_BUILD == "1" .build-and-test-on-lassen: stage: build diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh index 1f851f823d1..9f21d9de90b 100644 --- a/.gitlab/run_catch_tests.sh +++ b/.gitlab/run_catch_tests.sh @@ -6,126 +6,73 @@ echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" timeout -k 1m 10m \ ${build_dir}/build-lbann/unit_test/seq-catch-tests \ -r console \ - -r JUnit::out=${project_dir}/seq-tests_junit.xml || { + -r JUnit::out=${project_dir}/seq-tests_junit.xml \ + || { failed_tests=$(( ${failed_tests} + $? 
)) echo "******************************" - echo " >>> SeqCatchTests FAILED" + echo " >>> seq-catch-tests FAILED" echo "******************************" } -# if [[ -e "${build_dir}/build-lbann/bin/GPUCatchTests" ]] -# then -# echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -# echo "~~~~~ GPU tests" -# echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - -# timeout -k 1m 2m \ -# ${build_dir}/build-lbann/bin/GPUCatchTests \ -# -r console \ -# -r JUnit::out=${project_dir}/gpu-tests_junit.xml || { -# failed_tests=$(( ${failed_tests} + $? )) -# echo "******************************" -# echo " >>> GPUCatchTests FAILED" -# echo "******************************" -# } -# fi - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ MPI tests" +echo "~~~~~ MPI Catch Tests" +echo "----- LBANN output logged to: ${project_dir}/lbann-log-mpi-catch-tests.log" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -# FIXME (trb 20240702): There's an issue with needing each process to -# write out its own coverage database file, and I just need to sort -# through that before this will work. So for now, just run these when -# not running coverage tests. 
-if [[ -z "${run_coverage}" ]] -then - case "${cluster}" in - pascal) - export OMPI_MCA_mpi_warn_on_fork=0 - timeout -k 1m 10m \ - srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ - -D ${build_dir}/build-lbann \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ - -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ - -r JUnit::out=${project_dir}/mpi-tests-rank=%r-size=%s_junit.xml || { - failed_tests=$((${failed_tests=} + $?)) - echo "******************************" - echo " >>> MPICatchTests FAILED" - echo "******************************" -# -r mpicumulative \ - } - timeout -k 1m 10m \ - srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ - -D ${build_dir}/build-lbann \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ - -r console::out=${project_dir}/mpi-catch-filesystem-tests-console-rank=%r-size=%s.log \ - -r JUnit::out=${project_dir}/mpi-catch-filesystem-tests-rank=%r-size=%s_junit.xml || { - failed_tests=$((${failed_tests} + $?)) - echo "******************************" - echo " >>> mpi-catch-tests [filesystem] FAILED" - echo "******************************" - # -r mpicumulative \ - } - ;; - lassen) - timeout -k 1m 10m \ - jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ - -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ - -r JUnit::out=${project_dir}/mpi-tests-rank=%r-size=%s_junit.xml || { - failed_tests=$((${failed_tests} + $?)) - echo "******************************" - echo " >>> mpi-catch-tests FAILED" - echo "******************************" -# -r mpicumulative \ - } - timeout -k 1m 10m \ - jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ - -h ${build_dir}/build-lbann \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ - -r console::out=${project_dir}/mpi-catch-filesystem-tests-console-rank=%r-size=%s.log \ - -r 
JUnit::out=${project_dir}/mpi-catch-filesystem-tests-rank=%r-size=%s_junit.xml || { - failed_tests=$((${failed_tests} + $?)) - echo "******************************" - echo " >>> mpi-catch-tests [filesystem] FAILED" - echo "******************************" - # -r mpicumulative \ - } - ;; - corona|tioga) - export H2_SELECT_DEVICE_0=1 - timeout -k 1m 10m \ - flux run -N1 -n8 -g1 --exclusive \ - --cwd=${build_dir}/build-lbann \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests "exclude:[random]" "exclude:[filesystem]" \ - -r console::out=${project_dir}/mpi-tests-console-rank=%r-size=%s.log \ - -r JUnit::out=${project_dir}/mpi-tests-rank=%r-size=%s_junit.xml || { - failed_tests=$((${failed_tests} + $?)) - echo "******************************" - echo " >>> mpi-catch-tests FAILED" - echo "******************************" - # -r mpicumulative \ - } -# \ -# -o "${project_dir}/mpi-catch-filesystem-results-rank=%r-size=%s.xml" - timeout -k 1m 10m \ - flux run -N1 -n8 -g1 --exclusive \ - --cwd=${build_dir}/build-lbann \ - ${build_dir}/build-lbann/unit_test/mpi-catch-tests -s "[filesystem]" \ - -r console::out=${project_dir}/mpi-catch-filesystem-tests-console-rank=%r-size=%s.log \ - -r JUnit::out=${project_dir}/mpi-catch-filesystem-tests-rank=%r-size=%s_junit.xml || { - failed_tests=$((${failed_tests} + $?)) - echo "******************************" - echo " >>> mpi-catch-tests [filesystem] FAILED" - echo "******************************" - # -r mpicumulative \ - } - ;; - *) - echo "Unknown cluster: ${cluster}" - ;; - esac -else - echo "NOTE: Skipping MPI tests." 
-fi +case "${cluster}" in + pascal) + export OMPI_MCA_mpi_warn_on_fork=0 + timeout -k 1m 10m \ + srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ + -D ${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r console::out=${project_dir}/mpi-catch-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-tests-rank=%r-size=%s_junit.xml \ + > ${project_dir}/lbann-log-mpi-catch-tests.log 2>&1 \ + || { + failed_tests=$((${failed_tests=} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" + } + ;; + lassen) + timeout -k 1m 10m \ + jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r console::out=${project_dir}/mpi-catch-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-tests-rank=%r-size=%s_junit.xml \ + > ${project_dir}/lbann-log-mpi-catch-tests.log 2>&1 \ + || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" + } + ;; + corona|tioga) + export H2_SELECT_DEVICE_0=1 + timeout -k 1m 10m \ + flux run -N1 -n8 -g1 --exclusive \ + --cwd=${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r console::out=${project_dir}/mpi-catch-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-tests-rank=%r-size=%s_junit.xml \ + > ${project_dir}/lbann-log-mpi-catch-tests.log 2>&1 \ + || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" + } + ;; + *) + echo "Unknown cluster: ${cluster}" + ;; +esac + +for filename in ${project_dir}/mpi-catch-tests-console-rank=*.log; do + [ -e "$filename" ] || continue + echo "$filename" + cat $filename +done diff --git a/.gitlab/run_unit_and_integration_tests.sh 
b/.gitlab/run_unit_and_integration_tests.sh index e6b3f160766..555467694b6 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -5,7 +5,10 @@ LBANN_DIR=$(git rev-parse --show-toplevel) cd ${LBANN_DIR}/ci_test -echo "${PWD}/run.sh CLUSTER=${CLUSTER}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Running Integration and Unit tests" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +#echo "${PWD}/run.sh CLUSTER=${CLUSTER}" PYTHON=python3 LBANN_PYTHON=lbann_pfe.sh @@ -100,9 +103,11 @@ export OMP_NUM_THREADS=10 # unit tests should be run even if these fail. The status is cached # for now. status=0 +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "Task: Integration Tests with file pattern: ${TEST_FLAG}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" cd integration_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml ${TEST_FLAG} || { +$LBANN_PYTHON -m pytest -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml ${TEST_FLAG} || { this_status=$? status=$(( $status + $this_status )) failed_tests=$(( $failed_tests + $this_status )) @@ -112,9 +117,11 @@ $LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/integration } cd .. +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "Task: Unit Tests with file pattern: ${TEST_FLAG}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" cd unit_tests -$LBANN_PYTHON -m pytest -s -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_results_junit.xml ${TEST_FLAG} || { +$LBANN_PYTHON -m pytest -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_results_junit.xml ${TEST_FLAG} || { this_status=$? 
status=$(( $status + $this_status )) failed_tests=$(( $failed_tests + $this_status )) diff --git a/cmake/modules/SetupMPI.cmake b/cmake/modules/SetupMPI.cmake index f6491694a02..0f677866c11 100644 --- a/cmake/modules/SetupMPI.cmake +++ b/cmake/modules/SetupMPI.cmake @@ -26,9 +26,7 @@ # This module configures MPI and ensures the library is setup properly -if (NOT MPI_CXX_FOUND) - find_package(MPI REQUIRED COMPONENTS CXX) -endif () +find_package(MPI REQUIRED COMPONENTS C CXX) if (NOT TARGET MPI::MPI_CXX) add_library(MPI::MPI_CXX INTERFACE IMPORTED) From e6904b046f570905305264c94b8cb00e5b6775ec Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 6 Aug 2024 15:26:13 -0700 Subject: [PATCH 121/169] Force rebuild again. --- .gitlab/build-and-test-common.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index b6e3cf2e143..7438828571d 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -32,7 +32,7 @@ LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 2 -W 60" GIT_SUBMODULE_STRATEGY: none GIT_DEPTH: 5 -# REBUILD_DEPS: 1 + REBUILD_DEPS: 1 script: - printenv > ${CI_PROJECT_DIR}/ci_environment.log - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh From 7e49f85c95b5bcfbbb7cec05908092b5bf55e69f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 7 Aug 2024 09:56:26 -0700 Subject: [PATCH 122/169] Updated Lassen to use a newer python. Tweaking how rpath's are set. 
--- .gitlab/build-and-test-lassen.yml | 4 ++-- .gitlab/build-and-test.sh | 2 +- .gitlab/configure_lbann.sh | 13 ++++++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index 329979e0953..2421c07d038 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -38,13 +38,13 @@ include: clang-16-0-6-gcc-11-2-1-cuda-12-2-2-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.8.2" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" extends: .build-and-test-on-lassen clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.8.2" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" WITH_DISTCONV: "1" extends: .build-and-test-on-lassen diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 4d1eb46bd5c..99275c6c813 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -220,7 +220,7 @@ echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm" +CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" echo ${CMD} ${CMD} diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 3226c34e5eb..6cb17a4b1c3 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -7,8 +7,15 @@ else fi # Just for good measure... 
-extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64:${extra_rpaths:-""} +if [ -z "${extra_rpaths}" ]; then + extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64 +else + extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64:${extra_rpaths:-""} +fi +echo "I have modified the extra rpaths to be ${extra_rpaths}" + # -D CMAKE_BUILD_RPATH="${extra_rpaths//:/\;}" \ + # -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/\;}" \ cmake -G Ninja \ -S ${project_dir} \ @@ -18,8 +25,8 @@ cmake -G Ninja \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${prefix}/lbann \ \ - -D CMAKE_BUILD_RPATH="${extra_rpaths//:/\;}" \ - -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/\;}" \ + -D CMAKE_BUILD_RPATH="${extra_rpaths}" \ + -D CMAKE_INSTALL_RPATH="${extra_rpaths}" \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ -D CMAKE_CXX_STANDARD=17 \ From 0b99f252bd6f501446fca7abbd1f767ca2f0aaac Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Wed, 7 Aug 2024 11:51:34 -0700 Subject: [PATCH 123/169] Fixed quoting on RPATH --- .gitlab/configure_deps.sh | 4 ++-- .gitlab/configure_lbann.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index f2b62a9adab..3cded5a1667 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -19,8 +19,8 @@ cmake \ -D CMAKE_EXE_LINKER_FLAGS=${common_linker_flags} \ -D CMAKE_SHARED_LINKER_FLAGS=${common_linker_flags} \ \ - -D CMAKE_BUILD_RPATH=${extra_rpaths//:/|} \ - -D CMAKE_INSTALL_RPATH=${extra_rpaths//:/|} \ + -D CMAKE_BUILD_RPATH="${extra_rpaths//:/|}" \ + -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/|}" \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_BUILD_RPATH_USE_ORIGIN=OFF \ diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 6cb17a4b1c3..4081f04d8ad 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -25,8 +25,8 @@ cmake -G Ninja \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_INSTALL_PREFIX=${prefix}/lbann \ \ - -D CMAKE_BUILD_RPATH="${extra_rpaths}" \ - -D CMAKE_INSTALL_RPATH="${extra_rpaths}" \ + -D CMAKE_BUILD_RPATH="${extra_rpaths//:/;}" \ + -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/;}" \ -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ \ -D CMAKE_CXX_STANDARD=17 \ From 2f405956f7570dff5afe98bcc83ce8d318aeee37 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 7 Aug 2024 13:21:05 -0700 Subject: [PATCH 124/169] Fixed the path for the catch tests. 
--- .gitlab/run_catch_tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh index 9f21d9de90b..c5df67dd9b5 100644 --- a/.gitlab/run_catch_tests.sh +++ b/.gitlab/run_catch_tests.sh @@ -39,6 +39,7 @@ case "${cluster}" in lassen) timeout -k 1m 10m \ jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b packed:10 \ + -h ${build_dir}/build-lbann \ ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ -r console::out=${project_dir}/mpi-catch-tests-console-rank=%r-size=%s.log \ -r JUnit::out=${project_dir}/mpi-catch-tests-rank=%r-size=%s_junit.xml \ From c10ab02fea15dfe6cb7af8509a17f39754c799d2 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 13 Aug 2024 10:25:32 -0700 Subject: [PATCH 125/169] Fixed up a few shell details to make switching PEs simpler. --- .../superbuild/ci/ci_tioga_core_dependencies.sh | 14 ++++++++++++++ scripts/superbuild/ci/ci_tioga_env.sh | 7 ++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh index e46c1badf3e..2a9db32eed6 100755 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh @@ -24,6 +24,13 @@ ## permissions and limitations under the license. ################################################################################ +python_pkgs="ninja" +export PYTHONUSERBASE=${TMPDIR}/${USER}/python/${cluster} +export PATH=${PYTHONUSERBASE}/bin:${PATH} +python3 -m pip install --user ${python_pkgs} +# Make sure the PYTHONPATH is all good. 
+export PYTHONPATH=$(ls --color=no -1 -d ${PYTHONUSERBASE}/lib/python*/site-packages | paste -sd ":" - ):${PYTHONPATH:-""} + # Set to ON (or any CMake truthy value) to build all of the # dependencies of the LBANN stack BUILD_EXTERNAL_TPLS=ON @@ -55,6 +62,10 @@ source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh # Set to the preferred install directory INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} +if [ ! -e ${INSTALL_PREFIX} ]; then + mkdir -p ${INSTALL_PREFIX} +fi + # Set to the preferred build directory BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies @@ -113,4 +124,7 @@ cmake \ -D LBANN_SB_BUILD_AWS_OFI_RCCL=${BUILD_AWS_OFI_RCCL_PLUGIN}} # Save a list of the currently loaded modules +if [ ! -e ${INSTALL_PREFIX}/logs ]; then + mkdir -p ${INSTALL_PREFIX}/logs +fi module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 9fa5b3048e5..9657389ae36 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -42,7 +42,12 @@ AMD_GPU_ARCH=gfx90a,gfx942 ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${ROCM_VER}/${PE_ENV_lc} -INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-8.1.29/ +INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-${CRAY_MPICH_VERSION} + +if [[ "${PE_ENV_lc}" == "cray" ]]; then + # If using PrgEnv-cray add ${CRAYLIBS_X86_64} + EXTRA_RPATHS="${CRAYLIBS_X86_64}|${EXTRA_RPATHS}" +fi # Use an accessible build directory so that the source files are preserved for debuggin BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/tioga/${ROCM_VER}/${PE_ENV_lc} From 21aaba73e302b7b520dc8c6f500c144a2ae97712 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 13 Aug 2024 10:26:45 -0700 Subject: [PATCH 126/169] Building for Mi300A as well as 250. 
--- .gitlab/setup_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 1053c62c52a..8d9ecb24c24 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -79,7 +79,7 @@ case "${cluster}" in extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi rocm_platform=ON - gpu_arch=gfx90a + gpu_arch=gfx90a,gfx942 launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') From 2ca99eb4af8e1c1fc6acd1042a152e53335f7930 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 13 Aug 2024 10:40:34 -0700 Subject: [PATCH 127/169] Stop hardcoding the CRAY_MPICH_VERSION --- .gitlab/setup_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 8d9ecb24c24..4e8393a1066 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -83,7 +83,7 @@ case "${cluster}" in launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') - SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-8.1.29 + SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-${CRAY_MPICH_VERSION} ;; corona) extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" From 696544f2b3ea4989cc001fb1762b675a272d265f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 13 Aug 2024 16:04:33 -0700 Subject: [PATCH 128/169] Added the ability to export the AWS_OFI_RCCL plugin to the LD_LIBRARY_PATH when using the lbann_pfe.sh shell script. 
--- CMakeLists.txt | 1 + cmake/configure_files/lbann_pfe.sh.in | 2 ++ scripts/superbuild/CMakeLists.txt | 8 ++++++++ 3 files changed, 11 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a4a51a5d4b..2d2cccb57a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -589,6 +589,7 @@ endif (LBANN_WITH_PYTHON_FRONTEND OR LBANN_WITH_EMBEDDED_PYTHON) if (LBANN_WITH_PYTHON_FRONTEND) set(LBANN_PFE_PYTHON_EXECUTABLE "${Python_EXECUTABLE}" CACHE FILEPATH "") set(LBANN_PFE_PYTHONPATH "${PYTHON_INSTALL_PREFIX}/${CMAKE_INSTALL_PYTHONDIR}" CACHE STRING "") + set(AWS_OFI_RCCL_LIBRARY "$ENV{AWS_OFI_RCCL_LIBRARY}" CACHE STRING "") configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_pfe.sh.in" "${CMAKE_BINARY_DIR}/lbann_pfe.sh" diff --git a/cmake/configure_files/lbann_pfe.sh.in b/cmake/configure_files/lbann_pfe.sh.in index 402981f967c..e562add65c6 100644 --- a/cmake/configure_files/lbann_pfe.sh.in +++ b/cmake/configure_files/lbann_pfe.sh.in @@ -25,4 +25,6 @@ ################################################################################ #!/bin/sh export PYTHONPATH=@LBANN_PFE_PYTHONPATH@:${PYTHONPATH} +export AWS_OFI_RCCL_LIBRARY=@AWS_OFI_RCCL_LIBRARY@ +export LD_LIBRARY_PATH=@AWS_OFI_RCCL_LIBRARY@:${LD_LIBRARY_PATH} @LBANN_PFE_PYTHON_EXECUTABLE@ "$@" diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index 7918b3f7c49..b4c46554a12 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -154,6 +154,14 @@ if (LBANN_SB_BUILD_AWS_OFI_RCCL) message("export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBARY_PATH\}\n") message("-----------------------------------------------------------------\n") endif () +file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" + "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") +file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" + "export 
LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") +file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") +file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") if (LBANN_SB_FWD_LBANN_LBANN_WITH_PYTHON_FRONTEND) message("-----------------------------------------------------------------\n") From cd4cec48832d12630b512970b611c84219e1058f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 13 Aug 2024 16:14:30 -0700 Subject: [PATCH 129/169] Tweak the Tioga build environment. --- .gitlab/build-and-test-tioga.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index 6931081dd0d..5401fa37599 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -38,13 +38,13 @@ include: rocm-5-7-1-tioga: variables: COMPILER_FAMILY: amdclang - MODULES: "PrgEnv-amd amd/5.7.1 rocm/5.7.1 cray-mpich cray-libsci" + MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" extends: .build-and-test-on-tioga rocm-5-7-1-distconv-tioga: variables: COMPILER_FAMILY: amdclang - MODULES: "PrgEnv-amd amd/5.7.1 rocm/5.7.1 cray-mpich cray-libsci" + MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" WITH_DISTCONV: "1" extends: .build-and-test-on-tioga From 60ff967e54ec49f258606b988c46912ff0b36e67 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 13 Aug 2024 16:15:30 -0700 Subject: [PATCH 130/169] Work on building the dependencies on PrgEnv-cray. 
--- .gitlab/build-and-test-lassen.yml | 4 ++-- .gitlab/build-and-test.sh | 22 +++++++++++++++++++--- .gitlab/configure_lbann.sh | 19 +++++++------------ .gitlab/setup_env.sh | 2 ++ scripts/superbuild/ci/ci_tioga_env.sh | 3 ++- 5 files changed, 32 insertions(+), 18 deletions(-) diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index 2421c07d038..a14b11a0a76 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -38,13 +38,13 @@ include: clang-16-0-6-gcc-11-2-1-cuda-12-2-2-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5 fftw/3.3.10-gcc-11.2.1" extends: .build-and-test-on-lassen clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5 fftw/3.3.10-gcc-11.2.1" WITH_DISTCONV: "1" extends: .build-and-test-on-lassen diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 99275c6c813..ead88c10558 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -95,20 +95,25 @@ echo "~~~~~ Project dir: ${project_dir}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" +#dha_prefix=${INSTALL_EXTERNALS_ROOT}/rocm-5.7.1/amd/cray-mpich-8.1.29/dha_with_distconv dha_prefix=${prefix} # Just for good measure... 
-export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${dha_prefix}/aluminum:${dha_prefix}/hydrogen:${dha_prefix}/dihydrogen:${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} # Allow a user to force this rebuild_deps=${REBUILD_DEPS:-""} +#rebuild_deps=0 # Rebuild if the prefix doesn't exist. -if [[ ! -d "${prefix}" ]] +#if [[ ! -d "${prefix}" ]] +if [[ ! -d "${dha_prefix}" ]] then rebuild_deps=1 fi +#rebuild_deps=0 # Rebuild if latest hashes don't match if [[ -z "${rebuild_deps}" ]] @@ -220,10 +225,21 @@ echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" +CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann protobuf tqdm numpy scipy" echo ${CMD} ${CMD} +case "${cluster}" in + pascal) + CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann torch" + echo ${CMD} + ${CMD} + ;; + *) + echo "Unable to install torch via pip on ${cluster}" + ;; +esac + LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles #echo "I think that the module is in ${LBANN_MODFILES_DIR}" ml use ${LBANN_MODFILES_DIR} diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 4081f04d8ad..acdd428395b 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -1,22 +1,23 @@ if [[ "$cluster" == "lassen" ]] then - # lbann_lapack_opt="-D LBANN_BLA_VENDOR=IBMESSL" lbann_lapack_opt="-D BLA_VENDOR=Generic" + build_fft=ON else lbann_lapack_opt="" fi -# Just for good measure... 
+if [[ "$cluster" == "tioga" ]] +then + build_fft=OFF +fi + +# Default RPATH rules will not include in-source libraries from the prefix path... add them here. if [ -z "${extra_rpaths}" ]; then extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64 else extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64:${extra_rpaths:-""} fi -echo "I have modified the extra rpaths to be ${extra_rpaths}" - # -D CMAKE_BUILD_RPATH="${extra_rpaths//:/\;}" \ - # -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/\;}" \ - cmake -G Ninja \ -S ${project_dir} \ -B ${build_dir}/build-lbann \ @@ -57,9 +58,3 @@ cmake -G Ninja \ -D LBANN_WITH_EMBEDDED_PYTHON=ON \ -D LBANN_WITH_PYTHON_FRONTEND=ON \ -D LBANN_WITH_VISION=ON - - # -D CMAKE_BUILD_RPATH_USE_ORIGIN=OFF \ - # -D CMAKE_BUILD_WITH_INSTALL_RPATH=OFF \ - # -D CMAKE_SKIP_BUILD_RPATH=OFF \ - # -D CMAKE_SKIP_INSTALL_RPATH=OFF \ - # -D CMAKE_SKIP_RPATH=OFF \ diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 4e8393a1066..6d724ffb944 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -78,6 +78,7 @@ case "${cluster}" in else extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi + extra_rpaths="/usr/workspace/lbann/ci_stable_dependencies/tioga/rocm-5.7.1/cray/cray-mpich-8.1.30/aws_ofi_rccl/lib:${extra_rpaths}" rocm_platform=ON gpu_arch=gfx90a,gfx942 launcher=flux @@ -103,6 +104,7 @@ source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_s export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} #CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +echo "BVE HERE I think that AWS_OFI_RCCL_LIBRARY=${AWS_OFI_RCCL_LIBRARY}" CFLAGS=${CFLAGS:-""} CXXFLAGS=${CXXFLAGS:-""} diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 9657389ae36..679a35b9a98 
100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -44,9 +44,10 @@ PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${ROCM_VER}/${PE_ENV_lc} INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-${CRAY_MPICH_VERSION} -if [[ "${PE_ENV_lc}" == "cray" ]]; then +if [[ "${PE_ENV_lc}" = "cray" ]]; then # If using PrgEnv-cray add ${CRAYLIBS_X86_64} EXTRA_RPATHS="${CRAYLIBS_X86_64}|${EXTRA_RPATHS}" + export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} fi # Use an accessible build directory so that the source files are preserved for debuggin From 1f6fee947ff622e82b19c4d91ce2ab4a1b647165 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 13 Aug 2024 16:41:51 -0700 Subject: [PATCH 131/169] Fixed accidental debugging code. --- .gitlab/build-and-test-tioga.yml | 14 ++++++++++++++ .gitlab/build-and-test.sh | 12 ++++-------- .gitlab/setup_env.sh | 9 +++++++-- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index 5401fa37599..617bd347dc3 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -48,6 +48,20 @@ rocm-5-7-1-distconv-tioga: WITH_DISTCONV: "1" extends: .build-and-test-on-tioga + +rocm-6-2-0-tioga: + variables: + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci" + extends: .build-and-test-on-tioga + +rocm-6-2-0-distconv-tioga: + variables: + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci" + WITH_DISTCONV: "1" + extends: .build-and-test-on-tioga + .build-and-test-on-tioga: stage: build tags: [tioga, batch] diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index ead88c10558..6c45682a249 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -95,25 +95,20 @@ echo "~~~~~ Project dir: ${project_dir}" 
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" -#dha_prefix=${INSTALL_EXTERNALS_ROOT}/rocm-5.7.1/amd/cray-mpich-8.1.29/dha_with_distconv dha_prefix=${prefix} # Just for good measure... -export CMAKE_PREFIX_PATH=${dha_prefix}/aluminum:${dha_prefix}/hydrogen:${dha_prefix}/dihydrogen:${CMAKE_PREFIX_PATH} -#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} # Allow a user to force this rebuild_deps=${REBUILD_DEPS:-""} -#rebuild_deps=0 # Rebuild if the prefix doesn't exist. -#if [[ ! -d "${prefix}" ]] -if [[ ! -d "${dha_prefix}" ]] +if [[ ! -d "${prefix}" ]] then rebuild_deps=1 fi -#rebuild_deps=0 # Rebuild if latest hashes don't match if [[ -z "${rebuild_deps}" ]] @@ -225,7 +220,8 @@ echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann protobuf tqdm numpy scipy" +CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" +#CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann protobuf tqdm numpy scipy" echo ${CMD} ${CMD} diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 6d724ffb944..465bb8c3306 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -78,7 +78,6 @@ case "${cluster}" in else extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi - extra_rpaths="/usr/workspace/lbann/ci_stable_dependencies/tioga/rocm-5.7.1/cray/cray-mpich-8.1.30/aws_ofi_rccl/lib:${extra_rpaths}" rocm_platform=ON gpu_arch=gfx90a,gfx942 launcher=flux @@ -103,8 +102,14 @@ export 
CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-""} source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} #CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} +case "${cluster}" in + tioga) + CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.2.0/miopen:${CMAKE_PREFIX_PATH} + ;; + *) + ;; +esac CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -echo "BVE HERE I think that AWS_OFI_RCCL_LIBRARY=${AWS_OFI_RCCL_LIBRARY}" CFLAGS=${CFLAGS:-""} CXXFLAGS=${CXXFLAGS:-""} From d7625313bd8a506a24c10c00c83a481a559ac5c7 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 14 Aug 2024 14:06:51 -0700 Subject: [PATCH 132/169] Added DiHydrogen cache check. Only add Half prefix path when asked for. --- .gitlab/build-and-test.sh | 28 +++++++++++++++------------- .gitlab/setup_env.sh | 18 ++++++++++++++++-- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 6c45682a249..7da21a676a4 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -134,6 +134,13 @@ then h_prebuilt=$(cat ${prefix}/h-prebuilt-hash.txt) fi + h2_head=$(fetch-sha dihydrogen develop) + h2_prebuilt="" + if [[ -f "${prefix}/h2-prebuilt-hash.txt" ]] + then + h2_prebuilt=$(cat ${prefix}/h2-prebuilt-hash.txt) + fi + if [[ "${al_head}" != "${al_prebuilt}" ]] then echo "Prebuilt Aluminum hash does not match latest head; rebuilding." @@ -146,6 +153,12 @@ then echo " (prebuilt: ${h_prebuilt}; head: ${h_head})" rebuild_deps=1 fi + if [[ "${h2_head}" != "${h2_prebuilt}" ]] + then + echo "Prebuilt DiHydrogen hash does not match latest head; rebuilding." 
+ echo " (prebuilt: ${h2_prebuilt}; head: ${h2_head})" + rebuild_deps=1 + fi fi if [[ -n "${rebuild_deps}" ]] @@ -157,19 +170,8 @@ then echo "~~~~~ Install dir: ${prefix}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - # Get the superbuild because why not. - lbann_sb_top_dir=${build_dir}/sb - lbann_sb_dir=${lbann_sb_top_dir}/scripts/superbuild - mkdir -p ${lbann_sb_top_dir} - cd ${lbann_sb_top_dir} - - # Sparse checkout of the SuperBuild - git init - git remote add origin https://github.com/llnl/lbann - git fetch --depth=1 origin develop - git config core.sparseCheckout true - echo "scripts/superbuild" >> .git/info/sparse-checkout - git pull --ff-only origin develop + # Set the superbuild dir + lbann_sb_dir=${project_dir}/scripts/superbuild cd ${build_dir} # Uses "${cluster}", "${prefix}", and "${lbann_sb_dir}" diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 465bb8c3306..37f56577a56 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -25,6 +25,10 @@ case "${compiler_family,,}" in CC=${CC:-$(command -v cc)} CXX=${CXX:-$(command -v CC)} ;; + craycc) + CC=${CC:-$(command -v craycc)} + CXX=${CXX:-$(command -v craycxx)} + ;; *) echo "Unknown compiler family: ${compiler_family}. Using gnu." 
CC=${CC:-$(command -v gcc)} @@ -79,10 +83,18 @@ case "${cluster}" in extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi rocm_platform=ON - gpu_arch=gfx90a,gfx942 + gpu_arch=gfx90a +# gpu_arch=gfx90a,gfx942 launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') + case "${compiler_family,,}" in + craycc) + PE_ENV_lc=${PE_ENV_lc}cc + ;; + *) + ;; + esac SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-${CRAY_MPICH_VERSION} ;; corona) @@ -100,7 +112,9 @@ esac export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-""} source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} +if [[ "${build_half}" = "ON" ]]; then + export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} +fi #CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} case "${cluster}" in tioga) From edca7955045274e2933d8760b9538ddc9aa4458f Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 15 Aug 2024 09:21:44 -0700 Subject: [PATCH 133/169] Add the hash for H2. --- .gitlab/build-and-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 7da21a676a4..d07fcbe3b04 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -181,6 +181,7 @@ then # Stamp these commits cd ${build_dir}/build-deps/aluminum/src && git rev-parse HEAD > ${prefix}/al-prebuilt-hash.txt cd ${build_dir}/build-deps/hydrogen/src && git rev-parse HEAD > ${prefix}/h-prebuilt-hash.txt + cd ${build_dir}/build-deps/dihydrogen/src && git rev-parse HEAD > ${prefix}/h2-prebuilt-hash.txt echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Dependencies Built" From 7d561fc8a006b1c024c70900c6a670a6e957a26e Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Thu, 15 Aug 2024 09:53:10 -0700 Subject: [PATCH 134/169] Ensure that for AMD/HIP/ROCm systems all three fields GPU_TARGETS, AMDGPU_TARGETS, and CMAKE_HIP_ARCHITECTURES are set. --- .gitlab/configure_lbann.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index acdd428395b..60bf26f8409 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -36,6 +36,8 @@ cmake -G Ninja \ \ -D CMAKE_CUDA_ARCHITECTURES=${gpu_arch} \ -D CMAKE_HIP_ARCHITECTURES=${gpu_arch} \ + -D AMDGPU_TARGETS=${gpu_arch} \ + -D GPU_TARGETS=${gpu_arch} \ \ -D BUILD_SHARED_LIBS=ON \ -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ From 77c3b4f782f8bb4f7939178404bdb3621da66175 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 16 Aug 2024 10:45:02 -0700 Subject: [PATCH 135/169] Disable FFT on Lassen --- .gitlab/build-and-test-lassen.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index a14b11a0a76..cc9a2a55b5d 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -35,16 +35,17 @@ stages: include: local: "/.gitlab/build-and-test-common.yml" +# fftw/3.3.10-gcc-11.2.1 clang-16-0-6-gcc-11-2-1-cuda-12-2-2-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5 fftw/3.3.10-gcc-11.2.1" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" extends: .build-and-test-on-lassen clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5 fftw/3.3.10-gcc-11.2.1" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" WITH_DISTCONV: "1" extends: .build-and-test-on-lassen 
From 75b42ff69c0293bcb945b56a979ce5b5aaea87ea Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 16 Aug 2024 10:45:51 -0700 Subject: [PATCH 136/169] Disable installing torch. --- .gitlab/build-and-test.sh | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index d07fcbe3b04..7cbbed61409 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -96,9 +96,12 @@ echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" dha_prefix=${prefix} +#dha_prefix=${project_dir}/install-deps-tioga_craycc5_distconv_testing +#dha_prefix=${project_dir}/install-deps-tioga_cray5_hip_only_distconv_testing # Just for good measure... -export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${dha_prefix}/aluminum:${dha_prefix}/hydrogen:${dha_prefix}/dihydrogen:${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} # Allow a user to force this @@ -228,16 +231,16 @@ CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann echo ${CMD} ${CMD} -case "${cluster}" in - pascal) - CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann torch" - echo ${CMD} - ${CMD} - ;; - *) - echo "Unable to install torch via pip on ${cluster}" - ;; -esac +# case "${cluster}" in +# pascal) +# CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann torch" +# echo ${CMD} +# ${CMD} +# ;; +# *) +# echo "Unable to install torch via pip on ${cluster}" +# ;; +# esac LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles #echo "I think that the module is in ${LBANN_MODFILES_DIR}" From 
3366b276fb56b16b3607fb719f78b1678753b230 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 16 Aug 2024 10:46:32 -0700 Subject: [PATCH 137/169] Disable FFT on lassen right now. --- .gitlab/configure_lbann.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 60bf26f8409..80f1c7721d8 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -1,7 +1,7 @@ if [[ "$cluster" == "lassen" ]] then lbann_lapack_opt="-D BLA_VENDOR=Generic" - build_fft=ON + build_fft=OFF else lbann_lapack_opt="" fi @@ -23,7 +23,7 @@ cmake -G Ninja \ -B ${build_dir}/build-lbann \ \ -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_BUILD_TYPE=RelWithDebInfo \ -D CMAKE_INSTALL_PREFIX=${prefix}/lbann \ \ -D CMAKE_BUILD_RPATH="${extra_rpaths//:/;}" \ From 16777c5928bbbd3b32a4d09ee47d0acda166573a Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 16 Aug 2024 10:47:36 -0700 Subject: [PATCH 138/169] Set proper AMD architectures. --- .gitlab/setup_env.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 37f56577a56..cb6517ef3b3 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -83,8 +83,7 @@ case "${cluster}" in extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi rocm_platform=ON - gpu_arch=gfx90a -# gpu_arch=gfx90a,gfx942 + gpu_arch=gfx90a,gfx942 launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') From 4e581eaa0b6aa1afd3d8a1366df89e785b13398c Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Fri, 16 Aug 2024 18:41:36 -0700 Subject: [PATCH 139/169] Use a special PR for 6.2.0 --- .gitlab/configure_deps.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index 3cded5a1667..ed32b90df44 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -59,12 +59,15 @@ cmake \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_GPU_FP16=${build_half:-OFF} \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ \ -D LBANN_SB_BUILD_DiHydrogen=ON \ ${dihydrogen_lapack_opt} \ + -D LBANN_SB_DiHydrogen_TAG=fix-rocm-6-2-0-build \ + -D LBANN_SB_DiHydrogen_URL=https://github.com/benson31/dihydrogen \ -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${build_distconv:-OFF} \ From e34736875a992998d1076847e35d6237207dc579 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Sun, 18 Aug 2024 19:35:10 -0700 Subject: [PATCH 140/169] Explicitly turned on the half feature, which is not properly disabled when not set. 
--- .gitlab/build-and-test-lassen.yml | 2 ++ .gitlab/build-and-test-pascal.yml | 2 +- .gitlab/build-and-test-tioga.yml | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index cc9a2a55b5d..9d2e6b803c3 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -52,4 +52,6 @@ clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: .build-and-test-on-lassen: stage: build tags: [lassen, batch] + variables: + WITH_HALF: "1" extends: .build-and-test diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index 47a577ff4e7..1bd754dd55d 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -47,7 +47,7 @@ clang-14-0-6-cuda-11-8-0-distconv-pascal: variables: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" - WITH_HALF: "0" + WITH_HALF: "1" WITH_FFT: "0" WITH_DISTCONV: "1" extends: [.build-and-test-on-pascal, .build-and-test] diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index 617bd347dc3..16e0ef99727 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -67,4 +67,5 @@ rocm-6-2-0-distconv-tioga: tags: [tioga, batch] variables: LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120 -q pdebug" + WITH_HALF: "1" extends: .build-and-test From 054776512ed6c39698ff8a83f76b015494c35f77 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Sun, 18 Aug 2024 19:53:53 -0700 Subject: [PATCH 141/169] When not using a flag, set it to a NULL string, not 0. 
--- .gitlab/build-and-test-lassen.yml | 2 +- .gitlab/build-and-test-pascal.yml | 6 +++--- .gitlab/build-and-test-tioga.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index 9d2e6b803c3..a0409640e43 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -53,5 +53,5 @@ clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: stage: build tags: [lassen, batch] variables: - WITH_HALF: "1" + WITH_HALF: "" extends: .build-and-test diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index 1bd754dd55d..ad8eaed8998 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -40,15 +40,15 @@ clang-14-0-6-cuda-11-8-0-pascal: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" WITH_HALF: "1" - WITH_FFT: "0" + WITH_FFT: "" extends: [.build-and-test-on-pascal, .build-and-test] clang-14-0-6-cuda-11-8-0-distconv-pascal: variables: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" - WITH_HALF: "1" - WITH_FFT: "0" + WITH_HALF: "" + WITH_FFT: "" WITH_DISTCONV: "1" extends: [.build-and-test-on-pascal, .build-and-test] diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index 16e0ef99727..19e1940ee24 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -67,5 +67,5 @@ rocm-6-2-0-distconv-tioga: tags: [tioga, batch] variables: LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120 -q pdebug" - WITH_HALF: "1" + WITH_HALF: "" extends: .build-and-test From 71a5eebfda81b070377ed8e3bda9bd2ec7496ae5 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 19 Aug 2024 17:42:52 -0700 Subject: [PATCH 142/169] Reporting the state of the build script DHA features. 
--- .gitlab/configure_deps.sh | 9 ++++++++- .gitlab/setup_env.sh | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index ed32b90df44..9598bc568dd 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -7,6 +7,14 @@ else dihydrogen_lapack_opt="" fi +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "----- BVE Dependency Flags:" +echo "----- HALF: ${build_half}" +echo "----- DISTCONV: ${build_distconv}" +echo "----- FFT: ${build_fft}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +# -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_GPU_FP16=${build_half:-OFF} \ cmake \ -G Ninja \ -S ${lbann_sb_dir} \ @@ -59,7 +67,6 @@ cmake \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_GPU_FP16=${build_half:-OFF} \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index cb6517ef3b3..fd3ced2310b 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -83,7 +83,8 @@ case "${cluster}" in extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi rocm_platform=ON - gpu_arch=gfx90a,gfx942 +# gpu_arch=gfx90a,gfx942 + gpu_arch=gfx90a launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') @@ -133,16 +134,18 @@ LDFLAGS="${common_linker_flags} ${LDFLAGS}" # passed only to the LBANN stack. 
EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" +# EXTRA_CXX_FLAGS="-v -g3 -Wno-deprecated-declarations" +# EXTRA_HIP_FLAGS="-v -g3 -Wno-deprecated-declarations" # Prefer RPATH to RUNPATH (stability over flexibility) EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" # Update the location of external packages FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. -BUILD_WITH_HALF=${BUILD_WITH_HALF:-OFF} -BUILD_WITH_DISTCONV=${BUILD_WITH_DISTCONV:-OFF} +# # Set to ON to enable Half support. Only matters if building the +# # LBANN stack. +# BUILD_WITH_HALF=${BUILD_WITH_HALF:-OFF} +# BUILD_WITH_DISTCONV=${BUILD_WITH_DISTCONV:-OFF} # Make sure the compilers and flags are exported export CC CXX CUDACXX CUDAHOSTCXX CFLAGS CXXFLAGS LDFLAGS @@ -166,6 +169,11 @@ echo "~~~~~ CXXFLAGS: ${CXXFLAGS}" echo "~~~~~ LDFLAGS: ${LDFLAGS}" echo "~~~~~ Extra rpaths: ${extra_rpaths}" echo "~~~~~ CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" +echo "-----" +echo "----- Dependency Flags:" +echo "----- HALF: \"${build_half}\"" +echo "----- DISTCONV: \"${build_distconv}\"" +echo "----- FFT: \"${build_fft}\"" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" # Handle cuDNN From 70b064577faa55d6f94e90f8ead27c0f7fe17468 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Tue, 20 Aug 2024 10:18:58 -0700 Subject: [PATCH 143/169] Set flag to ON not 1 --- .gitlab/build-and-test-pascal.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index ad8eaed8998..0e31bf307ab 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -39,7 +39,7 @@ clang-14-0-6-cuda-11-8-0-pascal: variables: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" - WITH_HALF: "1" + WITH_HALF: "ON" WITH_FFT: "" extends: [.build-and-test-on-pascal, .build-and-test] @@ -49,7 +49,7 @@ clang-14-0-6-cuda-11-8-0-distconv-pascal: MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" WITH_HALF: "" WITH_FFT: "" - WITH_DISTCONV: "1" + WITH_DISTCONV: "ON" extends: [.build-and-test-on-pascal, .build-and-test] .build-and-test-on-pascal: From 84d1de74bc991410313918824bc010ed61aea7db Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 20 Aug 2024 11:49:17 -0700 Subject: [PATCH 144/169] Fix when local 6.2.0 MIOpen library is linked in. --- .gitlab/setup_env.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index fd3ced2310b..b27d7b4e7f3 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -118,7 +118,10 @@ fi #CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} case "${cluster}" in tioga) - CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.2.0/miopen:${CMAKE_PREFIX_PATH} + ROCM_VER=$(basename ${ROCM_PATH}) + if [[ "${ROCM_VER}" = "6.2.0" ]]; then + CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.2.0/miopen:${CMAKE_PREFIX_PATH} + fi ;; *) ;; From a6c96d1f884554d6e410a8f2ab14f196328722c0 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Wed, 21 Aug 2024 11:18:39 -0700 Subject: [PATCH 145/169] Auto-detect the CUDA version and compiler version. --- .gitlab/setup_env.sh | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index b27d7b4e7f3..42bfcf63ef6 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -36,6 +36,9 @@ case "${compiler_family,,}" in ;; esac +# Set the compiler version based on the path of the compiler +COMPILER_VER=$(basename $(dirname $(dirname $(which ${CC})))) + # HIP/CUDA configuration and launcher are platform-specific CUDACXX=${CUDACXX:=""} CUDAHOSTCXX=${CUDAHOSTCXX:=${CXX}} @@ -59,8 +62,7 @@ case "${cluster}" in cuda_platform=ON gpu_arch=60 launcher=slurm - CUDA_VER=cuda-11.8.0 - COMPILER_VER=clang-14.0.6-magic + CUDA_VER=$(basename ${CUDA_HOME}) SYSTEM_INSTALL_PREFIX_EXTERNALS=${CUDA_VER}/${COMPILER_VER}/openmpi-4.1.2 ;; lassen) @@ -69,8 +71,7 @@ case "${cluster}" in cuda_platform=ON gpu_arch=70 launcher=lsf - CUDA_VER=cuda-12.2.2 - COMPILER_VER=clang-16.0.6 + CUDA_VER=$(basename ${CUDA_HOME}) SYSTEM_INSTALL_PREFIX_EXTERNALS=${CUDA_VER}/${COMPILER_VER}/spectrum-mpi-rolling-release export CMAKE_PREFIX_PATH="${CI_STABLE_DEPENDENCIES_ROOT}/${cluster}/${CUDA_VER}/nccl_2.20.3-1+cuda12.2_ppc64le:${CI_STABLE_DEPENDENCIES_ROOT}/${cluster}/${CUDA_VER}/cudnn-linux-ppc64le-8.9.7.29_cuda12-archive:${CMAKE_PREFIX_PATH:-""}" ;; @@ -103,7 +104,6 @@ case "${cluster}" in gpu_arch=gfx906 launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) - COMPILER_VER=clang-14.0.6-magic SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${COMPILER_VER}/openmpi-4.1.2 ;; *) @@ -162,6 +162,7 @@ echo "~~~~~ GPU arch: ${gpu_arch}" echo "~~~~~ Launcher: ${launcher}" echo "~~~~~" echo "~~~~~ Compiler family: ${compiler_family}" +echo "~~~~~ Compiler version: ${COMPILER_VER}" echo "~~~~~ CC: ${CC}" echo "~~~~~ CXX: ${CXX}" echo "~~~~~ CUDACXX: ${CUDACXX}" @@ -179,19 +180,20 @@ echo "----- DISTCONV: 
\"${build_distconv}\"" echo "----- FFT: \"${build_fft}\"" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +# BVE disable this for now, but look to add it later # Handle cuDNN -if [[ "${cuda_platform}" == "ON" ]] -then - cuda_maj_version=$(basename ${CUDA_HOME} | grep -E --color=no -o "[0-9]+\.[0-9]+\.[0-9]+" | cut -d '.' -f 1) - arch=$(uname -m) - cudnn_root=$(ls -1 -d /usr/workspace/brain/cudnn/cudnn-*/cuda_${cuda_maj_version}_${arch} | tail -1) - if [[ -z "${cudnn_root}" ]] - then - echo "WARNING: No suitable cuDNN found." - else - CMAKE_PREFIX_PATH=${cudnn_root}:${CMAKE_PREFIX_PATH:-""} - fi -fi +# if [[ "${cuda_platform}" == "ON" ]] +# then +# cuda_maj_version=$(basename ${CUDA_HOME} | grep -E --color=no -o "[0-9]+\.[0-9]+\.[0-9]+" | cut -d '.' -f 1) +# arch=$(uname -m) +# cudnn_root=$(ls -1 -d /usr/workspace/brain/cudnn/cudnn-*/cuda_${cuda_maj_version}_${arch} | tail -1) +# if [[ -z "${cudnn_root}" ]] +# then +# echo "WARNING: No suitable cuDNN found." +# else +# CMAKE_PREFIX_PATH=${cudnn_root}:${CMAKE_PREFIX_PATH:-""} +# fi +# fi # Get Breathe, gcovr, and Ninja. Putting this off to the side because # I don't want to tweak "the real" python environment, but it's just From b445d538a87c809fe47ca5749354d24e19fd4dfd Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 21 Aug 2024 16:23:20 -0700 Subject: [PATCH 146/169] Working to consolidate how the core dependencies are built to use the same setup file as the CI runs. Fixed the build issues for CI on corona. Removed scripts for building DHA and LBANN manually (outside of CI). 
--- .gitlab/build-and-test.sh | 17 -- .gitlab/configure_deps.sh | 2 +- .gitlab/setup_env.sh | 83 ++++----- cmake/configure_files/lbann_pfe.sh.in | 6 +- scripts/superbuild/CMakeLists.txt | 16 +- scripts/superbuild/ci/ci_core_dependencies.sh | 152 +++++++++++++++++ scripts/superbuild/ci/ci_corona_dha.sh | 119 ------------- .../superbuild/ci/ci_corona_dha_distconv.sh | 119 ------------- scripts/superbuild/ci/ci_pascal_dha.sh | 115 ------------- .../superbuild/ci/ci_pascal_dha_distconv.sh | 114 ------------- scripts/superbuild/ci/ci_pascal_dha_half.sh | 113 ------------- scripts/superbuild/ci/ci_tioga_dha.sh | 129 -------------- .../superbuild/ci/ci_tioga_dha_distconv.sh | 129 -------------- scripts/superbuild/ci/ci_tioga_lbann.sh | 157 ------------------ 14 files changed, 201 insertions(+), 1070 deletions(-) create mode 100755 scripts/superbuild/ci/ci_core_dependencies.sh delete mode 100755 scripts/superbuild/ci/ci_corona_dha.sh delete mode 100755 scripts/superbuild/ci/ci_corona_dha_distconv.sh delete mode 100755 scripts/superbuild/ci/ci_pascal_dha.sh delete mode 100755 scripts/superbuild/ci/ci_pascal_dha_distconv.sh delete mode 100755 scripts/superbuild/ci/ci_pascal_dha_half.sh delete mode 100755 scripts/superbuild/ci/ci_tioga_dha.sh delete mode 100755 scripts/superbuild/ci/ci_tioga_dha_distconv.sh delete mode 100755 scripts/superbuild/ci/ci_tioga_lbann.sh diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 7cbbed61409..489026dc097 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -96,12 +96,9 @@ echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" dha_prefix=${prefix} -#dha_prefix=${project_dir}/install-deps-tioga_craycc5_distconv_testing -#dha_prefix=${project_dir}/install-deps-tioga_cray5_hip_only_distconv_testing # Just for good measure... 
export CMAKE_PREFIX_PATH=${dha_prefix}/aluminum:${dha_prefix}/hydrogen:${dha_prefix}/dihydrogen:${CMAKE_PREFIX_PATH} -#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} # Allow a user to force this @@ -227,26 +224,12 @@ echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" -#CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann protobuf tqdm numpy scipy" echo ${CMD} ${CMD} -# case "${cluster}" in -# pascal) -# CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann torch" -# echo ${CMD} -# ${CMD} -# ;; -# *) -# echo "Unable to install torch via pip on ${cluster}" -# ;; -# esac - LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles -#echo "I think that the module is in ${LBANN_MODFILES_DIR}" ml use ${LBANN_MODFILES_DIR} ml load lbann -#echo "$(which lbann)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Testing LBANN: $(which lbann)" diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index 9598bc568dd..4d01678f335 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -14,7 +14,6 @@ echo "----- DISTCONV: ${build_distconv}" echo "----- FFT: ${build_fft}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -# -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_GPU_FP16=${build_half:-OFF} \ cmake \ -G Ninja \ -S ${lbann_sb_dir} \ @@ -67,6 +66,7 @@ cmake \ -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \ + -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_GPU_FP16=${build_half:-OFF} \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF 
\ -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 42bfcf63ef6..12e31ecbfab 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -7,37 +7,53 @@ # can set "CC"/"CXX" directly, in which case the # "COMPILER_FAMILY" variable will be ignored. +# Prefer RPATH to RUNPATH (stability over flexibility) +common_linker_flags="-Wl,--disable-new-dtags" +CFLAGS=${CFLAGS:-""} +CXXFLAGS=${CXXFLAGS:-""} +LDFLAGS=${LDFLAGS:-""} +LDFLAGS="${common_linker_flags} ${LDFLAGS}" + compiler_family=${COMPILER_FAMILY:-gnu} case "${compiler_family,,}" in gnu|gcc) CC=${CC:-$(command -v gcc)} CXX=${CXX:-$(command -v g++)} + EXTRA_LINK_FLAGS="-fuse-ld=gold ${common_linker_flags}" ;; clang) CC=${CC:-$(command -v clang)} CXX=${CXX:-$(command -v clang++)} + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" ;; amdclang) CC=${CC:-$(command -v amdclang)} CXX=${CXX:-$(command -v amdclang++)} + ROCM_VER=$(basename ${ROCM_PATH}) + ROCM_VER_NUM=$(echo "${ROCM_VER}" | tr -d '[a-z]') + COMPILER_VER="amdclang${ROCM_VER_NUM}" + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" ;; cray) CC=${CC:-$(command -v cc)} CXX=${CXX:-$(command -v CC)} + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" ;; craycc) CC=${CC:-$(command -v craycc)} CXX=${CXX:-$(command -v craycxx)} + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" ;; *) echo "Unknown compiler family: ${compiler_family}. Using gnu." 
CC=${CC:-$(command -v gcc)} CXX=${CXX:-$(command -v g++)} + EXTRA_LINK_FLAGS="-fuse-ld=gold ${common_linker_flags}" ;; esac # Set the compiler version based on the path of the compiler -COMPILER_VER=$(basename $(dirname $(dirname $(which ${CC})))) +COMPILER_VER=${COMPILER_VER:-"$(basename $(dirname $(dirname $(which ${CC}))))"} # HIP/CUDA configuration and launcher are platform-specific CUDACXX=${CUDACXX:=""} @@ -48,7 +64,6 @@ rocm_platform=OFF launcher=mpiexec -common_linker_flags="-Wl,--disable-new-dtags" extra_rpaths=${extra_rpaths:-""} # Set to the preferred install directory for the external dependencies @@ -80,6 +95,7 @@ case "${cluster}" in if [[ -n "${cray_libs_dir}" ]] then extra_rpaths="${cray_libs_dir}:${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" + export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} else extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi @@ -89,13 +105,13 @@ case "${cluster}" in launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') - case "${compiler_family,,}" in - craycc) - PE_ENV_lc=${PE_ENV_lc}cc - ;; - *) - ;; - esac + # case "${compiler_family,,}" in + # craycc) + # PE_ENV_lc=${PE_ENV_lc}cc + # ;; + # *) + # ;; + # esac SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-${CRAY_MPICH_VERSION} ;; corona) @@ -111,16 +127,18 @@ case "${cluster}" in esac export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-""} -source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -if [[ "${build_half}" = "ON" ]]; then +ci_core_cmake_prefix_path="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh" +if [[ -e ${ci_core_cmake_prefix_path} ]]; then + source ${ci_core_cmake_prefix_path} +fi +if [[ "${build_half:-""}" = "ON" ]]; then export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} fi 
-#CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} case "${cluster}" in tioga) ROCM_VER=$(basename ${ROCM_PATH}) if [[ "${ROCM_VER}" = "6.2.0" ]]; then - CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.2.0/miopen:${CMAKE_PREFIX_PATH} + CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/${cluster}/rocm-6.2.0/miopen:${CMAKE_PREFIX_PATH} fi ;; *) @@ -128,28 +146,14 @@ case "${cluster}" in esac CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -CFLAGS=${CFLAGS:-""} -CXXFLAGS=${CXXFLAGS:-""} -LDFLAGS=${LDFLAGS:-""} -LDFLAGS="${common_linker_flags} ${LDFLAGS}" - # Improve debugging info and remove some misguided warnings. These are -# passed only to the LBANN stack. +# passed only to the LBANN stack. Add -v for debugging EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" -# EXTRA_CXX_FLAGS="-v -g3 -Wno-deprecated-declarations" -# EXTRA_HIP_FLAGS="-v -g3 -Wno-deprecated-declarations" -# Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" # Update the location of external packages FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} -# # Set to ON to enable Half support. Only matters if building the -# # LBANN stack. 
-# BUILD_WITH_HALF=${BUILD_WITH_HALF:-OFF} -# BUILD_WITH_DISTCONV=${BUILD_WITH_DISTCONV:-OFF} - # Make sure the compilers and flags are exported export CC CXX CUDACXX CUDAHOSTCXX CFLAGS CXXFLAGS LDFLAGS echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" @@ -175,30 +179,15 @@ echo "~~~~~ Extra rpaths: ${extra_rpaths}" echo "~~~~~ CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" echo "-----" echo "----- Dependency Flags:" -echo "----- HALF: \"${build_half}\"" -echo "----- DISTCONV: \"${build_distconv}\"" -echo "----- FFT: \"${build_fft}\"" +echo "----- HALF: \"${build_half:-""}\"" +echo "----- DISTCONV: \"${build_distconv:-""}\"" +echo "----- FFT: \"${build_fft:-""}\"" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -# BVE disable this for now, but look to add it later -# Handle cuDNN -# if [[ "${cuda_platform}" == "ON" ]] -# then -# cuda_maj_version=$(basename ${CUDA_HOME} | grep -E --color=no -o "[0-9]+\.[0-9]+\.[0-9]+" | cut -d '.' -f 1) -# arch=$(uname -m) -# cudnn_root=$(ls -1 -d /usr/workspace/brain/cudnn/cudnn-*/cuda_${cuda_maj_version}_${arch} | tail -1) -# if [[ -z "${cudnn_root}" ]] -# then -# echo "WARNING: No suitable cuDNN found." -# else -# CMAKE_PREFIX_PATH=${cudnn_root}:${CMAKE_PREFIX_PATH:-""} -# fi -# fi - # Get Breathe, gcovr, and Ninja. Putting this off to the side because # I don't want to tweak "the real" python environment, but it's just # these one or two things so it's not worth a venv. 
-if [[ -n "${run_coverage}" ]] +if [[ -n "${run_coverage:-""}" ]] then python_pkgs="ninja gcovr" else diff --git a/cmake/configure_files/lbann_pfe.sh.in b/cmake/configure_files/lbann_pfe.sh.in index e562add65c6..5bcd9cf6953 100644 --- a/cmake/configure_files/lbann_pfe.sh.in +++ b/cmake/configure_files/lbann_pfe.sh.in @@ -25,6 +25,8 @@ ################################################################################ #!/bin/sh export PYTHONPATH=@LBANN_PFE_PYTHONPATH@:${PYTHONPATH} -export AWS_OFI_RCCL_LIBRARY=@AWS_OFI_RCCL_LIBRARY@ -export LD_LIBRARY_PATH=@AWS_OFI_RCCL_LIBRARY@:${LD_LIBRARY_PATH} +if [[ -e "@AWS_OFI_RCCL_LIBRARY@" ]]; then + export AWS_OFI_RCCL_LIBRARY=@AWS_OFI_RCCL_LIBRARY@ + export LD_LIBRARY_PATH=@AWS_OFI_RCCL_LIBRARY@:${LD_LIBRARY_PATH} +fi @LBANN_PFE_PYTHON_EXECUTABLE@ "$@" diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index b4c46554a12..269e7e154a5 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -153,15 +153,15 @@ if (LBANN_SB_BUILD_AWS_OFI_RCCL) message("have an effect. 
It may be useful to do the following:\n") message("export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBARY_PATH\}\n") message("-----------------------------------------------------------------\n") + file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" + "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") + file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" + "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") + file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") + file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") endif () -file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" - "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") -file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" - "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") -file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" - "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") -file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" - "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") if (LBANN_SB_FWD_LBANN_LBANN_WITH_PYTHON_FRONTEND) message("-----------------------------------------------------------------\n") diff --git a/scripts/superbuild/ci/ci_core_dependencies.sh b/scripts/superbuild/ci/ci_core_dependencies.sh new file mode 100755 index 00000000000..fddc2519364 --- /dev/null +++ b/scripts/superbuild/ci/ci_core_dependencies.sh @@ -0,0 +1,152 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence 
Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Example usage: COMPILER_FAMILY= ./scripts/superbuild/ci/ci_core_dependencies.sh +# Initialize modules for users not using bash as a default shell +modules_home=${MODULESHOME:-"/usr/share/lmod/lmod"} +if [[ -e ${modules_home}/init/bash ]] +then + source ${modules_home}/init/bash +fi + +set -o errexit +set -o nounset + +hostname="$(hostname)" +cluster=${hostname//[0-9]/} +project_dir="$(git rev-parse --show-toplevel)" +if [[ $? -eq 1 ]] +then + project_dir="$(pwd)" +fi +# Finish setting up the environment +source ${project_dir}/.gitlab/setup_env.sh + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +case "${cluster}" in + tioga) + # Set to ON if you're on a Cray machine that doesn't provide the AWS + # plugin as part of its default RCCL installation. + # + # It might also be advisable to build this if you build a custom RCCL. 
+ # The configuration script takes a RCCL path as a parameter, so it + # could matter, but it's not clear how much. + aws_ofi_plugin="-D LBANN_SB_BUILD_AWS_OFI_RCCL=ON" + ;; + *) + aws_ofi_plugin="" + ;; +esac + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +#source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh + +# Use an accessible build directory so that the source files are preserved for debugging +BUILD_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/.build/${cluster}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} + +if [ ! -e ${INSTALL_PREFIX} ]; then + mkdir -p ${INSTALL_PREFIX} +fi + +# Set to the preferred build directory +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies + +# The compilers are set via CC, CXX environment variables +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_BUILD_RPATH="${extra_rpaths//:/|}" \ + -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/|}" \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + \ + -D CMAKE_CUDA_ARCHITECTURES=${gpu_arch} \ + -D CMAKE_HIP_ARCHITECTURES=${gpu_arch} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=${cuda_platform} \ + -D
LBANN_SB_DEFAULT_ROCM_OPTS=${rocm_platform} \ + \ + -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Caliper=ON \ + -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ + \ + -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hiptt=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hwloc=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ + \ + ${aws_ofi_plugin} \ + \ + -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenCV_TAG=4.x + +# Save a list of the currently loaded modules +if [ ! -e ${INSTALL_PREFIX}/logs ]; then + mkdir -p ${INSTALL_PREFIX}/logs +fi +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt + +pushd ${BUILD_DIR} +ninja +popd diff --git a/scripts/superbuild/ci/ci_corona_dha.sh b/scripts/superbuild/ci/ci_corona_dha.sh deleted file mode 100755 index 007a949a449..00000000000 --- a/scripts/superbuild/ci/ci_corona_dha.sh +++ /dev/null @@ -1,119 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. 
-## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=OFF - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. 
-BUILD_WITH_HALF=OFF - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_corona_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha - -# Update the location of external packages -source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ - -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ - \ - -D CMAKE_C_COMPILER=$(which amdclang) \ - -D CMAKE_CXX_COMPILER=$(which amdclang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_HIP_STANDARD=17 \ - -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ - \ - -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ - -D 
LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ - -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ - -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_corona_dha_distconv.sh b/scripts/superbuild/ci/ci_corona_dha_distconv.sh deleted file mode 100755 index c81dceec739..00000000000 --- a/scripts/superbuild/ci/ci_corona_dha_distconv.sh +++ /dev/null @@ -1,119 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. 
For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=ON - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. 
-BUILD_WITH_HALF=OFF - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_corona_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv - -# Update the location of external packages -source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ - -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ - \ - -D CMAKE_C_COMPILER=$(which amdclang) \ - -D CMAKE_CXX_COMPILER=$(which amdclang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_HIP_STANDARD=17 \ - -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ - \ - -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D 
LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ - -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ - -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_pascal_dha.sh b/scripts/superbuild/ci/ci_pascal_dha.sh deleted file mode 100755 index ebd1cc9927b..00000000000 --- a/scripts/superbuild/ci/ci_pascal_dha.sh +++ /dev/null @@ -1,115 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. 
For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=OFF - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. 
-BUILD_WITH_HALF=OFF - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha - -# Update the location of external packages -source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - \ - -D CMAKE_C_COMPILER=$(which clang) \ - -D CMAKE_CXX_COMPILER=$(which clang++) \ - -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_CUDA_STANDARD=17 \ - -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ - \ - -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Aluminum_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ - -D 
LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ - -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Hydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ - -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh b/scripts/superbuild/ci/ci_pascal_dha_distconv.sh deleted file mode 100755 index f16e65dc45c..00000000000 --- a/scripts/superbuild/ci/ci_pascal_dha_distconv.sh +++ /dev/null @@ -1,114 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. 
For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=ON - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. 
-BUILD_WITH_HALF=OFF - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv - -# Update the location of external packages -source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - \ - -D CMAKE_C_COMPILER=$(which clang) \ - -D CMAKE_CXX_COMPILER=$(which clang++) \ - -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_CUDA_STANDARD=17 \ - -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ - \ - -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Aluminum_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ - -D 
LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ - -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Hydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ - -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_pascal_dha_half.sh b/scripts/superbuild/ci/ci_pascal_dha_half.sh deleted file mode 100755 index 0243b15dabc..00000000000 --- a/scripts/superbuild/ci/ci_pascal_dha_half.sh +++ /dev/null @@ -1,113 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. 
-## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=OFF - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. -BUILD_WITH_HALF=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_half - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-half - -# Update the location of external packages -source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D 
CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - \ - -D CMAKE_C_COMPILER=$(which clang) \ - -D CMAKE_CXX_COMPILER=$(which clang++) \ - -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_CUDA_STANDARD=17 \ - -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ - \ - -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Aluminum_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ - -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Hydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ - -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_DiHydrogen_CUDA_FLAGS="${EXTRA_CUDA_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D 
LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_dha.sh b/scripts/superbuild/ci/ci_tioga_dha.sh deleted file mode 100755 index 6b040488ce8..00000000000 --- a/scripts/superbuild/ci/ci_tioga_dha.sh +++ /dev/null @@ -1,129 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=OFF - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. 
-BUILD_WITH_HALF=OFF - -# Set to ON if you're on a Cray machine that doesn't provide the AWS -# plugin as part of its default RCCL installation. -# -# It might also be advisable to build this if you build a custom RCCL. -# The configuration script takes a RCCL path as a parameter, so it -# could matter, but it's not clear how much. -BUILD_AWS_OFI_RCCL_PLUGIN=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha - -# Update the location of external packages -source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ - -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ - \ - -D CMAKE_C_COMPILER=$(which amdclang) \ - -D CMAKE_CXX_COMPILER=$(which amdclang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_HIP_STANDARD=17 \ - -D 
CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ - \ - -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ - -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ - -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh b/scripts/superbuild/ci/ci_tioga_dha_distconv.sh deleted file mode 100755 index 43d09e6b709..00000000000 --- a/scripts/superbuild/ci/ci_tioga_dha_distconv.sh +++ /dev/null @@ -1,129 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, 
Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=ON - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. -BUILD_WITH_HALF=OFF - -# Set to ON if you're on a Cray machine that doesn't provide the AWS -# plugin as part of its default RCCL installation. -# -# It might also be advisable to build this if you build a custom RCCL. -# The configuration script takes a RCCL path as a parameter, so it -# could matter, but it's not clear how much. 
-BUILD_AWS_OFI_RCCL_PLUGIN=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha_with_distconv - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-distconv - -# Update the location of external packages -source ${INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh -#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ - -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ - \ - -D CMAKE_C_COMPILER=$(which amdclang) \ - -D CMAKE_CXX_COMPILER=$(which amdclang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_HIP_STANDARD=17 \ - -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ - \ - -D LBANN_SB_BUILD_Aluminum=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D 
LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \ - -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \ - -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_Hydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${BUILD_WITH_HALF} \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \ - -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \ - -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} \ - \ - -D LBANN_SB_BUILD_DiHydrogen=${BUILD_LBANN_STACK} \ - -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_lbann.sh b/scripts/superbuild/ci/ci_tioga_lbann.sh deleted file mode 100755 index d1fdf82370a..00000000000 --- a/scripts/superbuild/ci/ci_tioga_lbann.sh +++ /dev/null @@ -1,157 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. 
-## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -################################################################ -# Parse command-line arguments -################################################################ - -# while :; do -# case ${1} in -# -h|--help) -# # Help message -# help_message -# exit 1 -# ;; -# --install_prefix) -# if [ -n "${2}" ]; then -# INSTALL_PREFIX=${2} -# shift -# else -# echo "\"${1}\" option requires a non-empty option argument" >&2 -# exit 1 -# fi -# ;; -# *) -# # Break loop if there are no more options -# break -# esac -# shift -# done - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=OFF - -# Set to ON to enable Half support. Only matters if building the -# LBANN stack. -BUILD_WITH_HALF=OFF - -# Set to ON if you're on a Cray machine that doesn't provide the AWS -# plugin as part of its default RCCL installation. -# -# It might also be advisable to build this if you build a custom RCCL. 
-# The configuration script takes a RCCL path as a parameter, so it -# could matter, but it's not clear how much. -BUILD_AWS_OFI_RCCL_PLUGIN=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh - -# Set to the preferred install directory -#INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS}/dha - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-dha-lbann-${PE_ENV_lc}-${ROCM_VER} - -# Set to the preferred install directory -INSTALL_PREFIX=${LBANN_SRC_DIR}/superbuild/install_${ROCM_VER} - -# Update the location of external packages -#source ${INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh -source ${INSTALL_PREFIX_EXTERNALS}/dha/logs/lbann_sb_suggested_cmake_prefix_path.sh -#export CMAKE_PREFIX_PATH=${INSTALL_PREFIX}/half-2.1.0:${CMAKE_PREFIX_PATH} -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} - -#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} -# -D CMAKE_BUILD_RPATH=${EXTRA_RPATHS} \ -#-DCMAKE_BUILD_RPATH=${EXTRA_RPATHS} -#-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -# -D CMAKE_BUILD_WITH_INSTALL_RPATH=ON \ - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ - -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ - \ - -D CMAKE_C_COMPILER=$(which amdclang) \ - -D CMAKE_CXX_COMPILER=$(which amdclang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D 
CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_HIP_STANDARD=17 \ - -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ - \ - -D LBANN_SB_BUILD_LBANN=${BUILD_LBANN_STACK} \ - -D LBANN_SB_LBANN_BUILD_SHARED_LIBS=ON \ - -D LBANN_SB_LBANN_SOURCE_DIR=${LBANN_SRC_DIR} \ - -D LBANN_SB_FWD_LBANN_CMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -D LBANN_SB_LBANN_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ - -D LBANN_SB_LBANN_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \ - -D LBANN_SB_FWD_LBANN_LBANN_DATATYPE=float \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_CALIPER=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_DISTCONV=${BUILD_WITH_DISTCONV} \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_TBINF=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_UNIT_TESTING=ON \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_CNPY=ON \ - -D LBANN_SB_FWD_LBANN_LBANN_DETERMINISTIC=ON \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_ADDRESS_SANITIZER=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_NVSHMEM=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_FFT=OFF \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_EMBEDDED_PYTHON=ON \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_PYTHON_FRONTEND=ON \ - -D LBANN_SB_FWD_LBANN_LBANN_WITH_VISION=ON \ - -D LBANN_SB_FWD_LBANN_CMAKE_PREFIX_PATH=${FWD_CMAKE_PREFIX_PATH} From 4edfe32f6cf67736e1b0ab083ac7d5edfb87c6c4 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 21 Aug 2024 16:39:40 -0700 Subject: [PATCH 147/169] Cleaning up Power and HIP specific flags. 
--- scripts/superbuild/ci/ci_core_dependencies.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/superbuild/ci/ci_core_dependencies.sh b/scripts/superbuild/ci/ci_core_dependencies.sh index fddc2519364..b665e9fc48b 100755 --- a/scripts/superbuild/ci/ci_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_core_dependencies.sh @@ -58,9 +58,17 @@ case "${cluster}" in # The configuration script takes a RCCL path as a parameter, so it # could matter, but it's not clear how much. aws_ofi_plugin="-D LBANN_SB_BUILD_AWS_OFI_RCCL=ON" + BUILD_ROCM_TPLS="ON" + ;; + corona) + BUILD_ROCM_TPLS="ON" + ;; + lassen) + power9_flags="-D LBANN_SB_OpenCV_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ + -D LBANN_SB_OpenCV_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ + -D LBANN_SB_FWD_OpenCV_WITH_IPP=OFF" ;; *) - aws_ofi_plugin="" ;; esac @@ -126,7 +134,7 @@ cmake \ -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_hiptt=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hiptt=${BUILD_ROCM_TPLS:-"OFF"} \ -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ @@ -135,10 +143,11 @@ cmake \ -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ \ - ${aws_ofi_plugin} \ + ${aws_ofi_plugin:-""} \ \ -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + ${power9_flags:=""} \ -D LBANN_SB_OpenCV_TAG=4.x # Save a list of the currently loaded modules From 54a0fd53f1e405e6c22c47fb14f1c114f7870543 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 22 Aug 2024 15:36:38 -0700 Subject: [PATCH 148/169] Added support for creating a Python virtual environment in the CI stack. Improved the core dependencies for Power. 
--- .gitlab/setup_env.sh | 12 +- scripts/superbuild/ci/ci_core_dependencies.sh | 2 + .../ci/ci_corona_core_dependencies.sh | 106 ------------------ scripts/superbuild/ci/ci_corona_env.sh | 53 --------- .../ci/ci_lassen_core_dependencies.sh | 103 ----------------- scripts/superbuild/ci/ci_lassen_env.sh | 53 --------- 6 files changed, 13 insertions(+), 316 deletions(-) delete mode 100755 scripts/superbuild/ci/ci_corona_core_dependencies.sh delete mode 100644 scripts/superbuild/ci/ci_corona_env.sh delete mode 100755 scripts/superbuild/ci/ci_lassen_core_dependencies.sh delete mode 100755 scripts/superbuild/ci/ci_lassen_env.sh diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 12e31ecbfab..96ab96fd955 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -194,9 +194,19 @@ else python_pkgs="ninja" fi +VENV_DIR="${TMPDIR}/${USER}/lbann_venv" +CMD="python3 -m venv ${VENV_DIR}" +echo "${CMD}" +${CMD} +CMD="source ${VENV_DIR}/bin/activate" +echo "${CMD}" +${CMD} + export PYTHONUSERBASE=${TMPDIR}/${USER}/python/${cluster} export PATH=${PYTHONUSERBASE}/bin:${PATH} -python3 -m pip install --user ${python_pkgs} +CMD="python3 -m pip install --prefix ${PYTHONUSERBASE} ${python_pkgs}" +echo "${CMD}" +${CMD} # Make sure the PYTHONPATH is all good. export PYTHONPATH=$(ls --color=no -1 -d ${PYTHONUSERBASE}/lib/python*/site-packages | paste -sd ":" - ):${PYTHONPATH:-""} diff --git a/scripts/superbuild/ci/ci_core_dependencies.sh b/scripts/superbuild/ci/ci_core_dependencies.sh index b665e9fc48b..3076542cceb 100755 --- a/scripts/superbuild/ci/ci_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_core_dependencies.sh @@ -42,6 +42,7 @@ if [[ $? 
-eq 1 ]] then project_dir="$(pwd)" fi + # Finish setting up the environment source ${project_dir}/.gitlab/setup_env.sh @@ -66,6 +67,7 @@ case "${cluster}" in lassen) power9_flags="-D LBANN_SB_OpenCV_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ -D LBANN_SB_OpenCV_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ + -D LBANN_SB_FWD_OpenCV_WITH_OPENJPEG=OFF \ -D LBANN_SB_FWD_OpenCV_WITH_IPP=OFF" ;; *) diff --git a/scripts/superbuild/ci/ci_corona_core_dependencies.sh b/scripts/superbuild/ci/ci_corona_core_dependencies.sh deleted file mode 100755 index 1b3f6f96377..00000000000 --- a/scripts/superbuild/ci/ci_corona_core_dependencies.sh +++ /dev/null @@ -1,106 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. 
-################################################################################ - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_corona_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ - -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ - \ - -D CMAKE_C_COMPILER=$(which amdclang) \ - -D CMAKE_CXX_COMPILER=$(which amdclang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_HIP_STANDARD=17 \ - -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ - \ - -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ - -D 
LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ - \ - -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_hiptt=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_hwloc=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_OpenCV_TAG=4.x - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_corona_env.sh b/scripts/superbuild/ci/ci_corona_env.sh deleted file mode 100644 index b7841caf522..00000000000 --- a/scripts/superbuild/ci/ci_corona_env.sh +++ /dev/null @@ -1,53 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. 
You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Improve debugging info and remove some misguided warnings. These are -# passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" - -# Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" -# If using PrgEnv-cray add ${CRAYLIBS_X86_64} -EXTRA_RPATHS="${ROCM_PATH}/lib|${ROCM_PATH}/llvm/lib" - -# Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) -AMD_GPU_ARCH=gfx906 - -ROCM_VER=$(basename ${ROCM_PATH}) -COMPILER_VER=clang-14.0.6-magic -# Set to the preferred install directory -CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies -INSTALL_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/corona/${ROCM_VER} -INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/openmpi-4.1.2 - -# Use an accessible build directory so that the source files are preserved for debugging -BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/corona/${ROCM_VER}/${COMPILER_VER} - -# Location of external packages -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} - -#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} diff --git a/scripts/superbuild/ci/ci_lassen_core_dependencies.sh b/scripts/superbuild/ci/ci_lassen_core_dependencies.sh deleted file mode 100755 index 2d00ca3eea3..00000000000 --- a/scripts/superbuild/ci/ci_lassen_core_dependencies.sh +++ /dev/null @@ -1,103 +0,0 @@ 
-################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. 
-################################################################################ - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_lassen_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies - -#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - \ - -D CMAKE_C_COMPILER=$(which clang) \ - -D CMAKE_CXX_COMPILER=$(which clang++) \ - -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_CUDA_STANDARD=17 \ - -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ - \ - -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ - -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ - \ - -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ - -D 
LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_OpenCV_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ - -D LBANN_SB_OpenCV_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ - -D LBANN_SB_FWD_OpenCV_WITH_IPP=OFF \ - -D LBANN_SB_OpenCV_TAG=4.x \ - -D LBANN_SB_OpenCV_TAG=4.x - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_lassen_env.sh b/scripts/superbuild/ci/ci_lassen_env.sh deleted file mode 100755 index ab5d785b909..00000000000 --- a/scripts/superbuild/ci/ci_lassen_env.sh +++ /dev/null @@ -1,53 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. 
You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Improve debugging info and remove some misguided warnings. These are -# passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" - -# Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS_CORE="-Wl,--disable-new-dtags" -EXTRA_LINK_FLAGS="-fuse-ld=lld ${EXTRA_LINK_FLAGS_CORE}" - -# Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) -CUDA_GPU_ARCH=70 - -#CUDA_VER=cuda-11.8.0 -CUDA_VER=cuda-12.2.2 -COMPILER_VER=clang-16.0.6 -#COMPILER_VER=clang-14.0.6-magic -# Set to the preferred install directory -CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies -INSTALL_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/lassen/${CUDA_VER} -INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/spectrum-mpi-rolling-release - -# Use an accessible build directory so that the source files are preserved for debuggin -BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/lassen/${CUDA_VER}/${COMPILER_VER} - -# Location of external packages -export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-1.7.0.1/libcutensor-linux-ppc64le-1.7.0.1-archive -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} From b7e09bdad15d6ca3b5a154fb6a8e4fee7a06eec7 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Thu, 22 Aug 2024 15:42:43 -0700 Subject: [PATCH 149/169] Removed older core platform specific dependency scripts. --- .../ci/ci_pascal_core_dependencies.sh | 99 ------------- scripts/superbuild/ci/ci_pascal_env.sh | 51 ------- .../ci/ci_tioga_core_dependencies.sh | 130 ------------------ scripts/superbuild/ci/ci_tioga_env.sh | 64 --------- 4 files changed, 344 deletions(-) delete mode 100755 scripts/superbuild/ci/ci_pascal_core_dependencies.sh delete mode 100644 scripts/superbuild/ci/ci_pascal_env.sh delete mode 100755 scripts/superbuild/ci/ci_tioga_core_dependencies.sh delete mode 100644 scripts/superbuild/ci/ci_tioga_env.sh diff --git a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh b/scripts/superbuild/ci/ci_pascal_core_dependencies.sh deleted file mode 100755 index 2239fc5f60c..00000000000 --- a/scripts/superbuild/ci/ci_pascal_core_dependencies.sh +++ /dev/null @@ -1,99 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. 
See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_pascal_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies - -#export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - \ - -D CMAKE_C_COMPILER=$(which clang) \ - -D CMAKE_CXX_COMPILER=$(which clang++) \ - -D CMAKE_CUDA_COMPILER=$(command -v nvcc) \ - -D CMAKE_CUDA_HOST_COMPILER=$(command -v clang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS_CORE}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_CUDA_STANDARD=17 \ - -D CMAKE_CUDA_ARCHITECTURES=${CUDA_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_CUDA_OPTS=ON \ - \ - -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ - -D 
LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ - \ - -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_OpenCV_TAG=4.x - -# Save a list of the currently loaded modules -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_pascal_env.sh b/scripts/superbuild/ci/ci_pascal_env.sh deleted file mode 100644 index 6118315ddf2..00000000000 --- a/scripts/superbuild/ci/ci_pascal_env.sh +++ /dev/null @@ -1,51 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. 
See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Improve debugging info and remove some misguided warnings. These are -# passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_CUDA_FLAGS="-g3 -Wno-deprecated-declarations" - -# Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS_CORE="-Wl,--disable-new-dtags" -EXTRA_LINK_FLAGS="-fuse-ld=lld ${EXTRA_LINK_FLAGS_CORE}" - -# Set this to the CUDA GPU arch(s) to support (example set for Lassen/Sierra) -CUDA_GPU_ARCH=60 - -CUDA_VER=cuda-11.8.0 -COMPILER_VER=clang-14.0.6-magic -# Set to the preferred install directory -CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies -INSTALL_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/pascal/${CUDA_VER} -INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/${COMPILER_VER}/openmpi-4.1.2 - -# Use an accessible build directory so that the source files are preserved for debuggin -BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/pascal/${CUDA_VER}/${COMPILER_VER} - -# Location of external packages -export CMAKE_PREFIX_PATH=${INSTALL_ROOT}/cudnn-8.9.4:${INSTALL_ROOT}/nccl-2.19.4:${INSTALL_ROOT}/../../cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} diff --git a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh b/scripts/superbuild/ci/ci_tioga_core_dependencies.sh deleted file mode 100755 index 2a9db32eed6..00000000000 --- a/scripts/superbuild/ci/ci_tioga_core_dependencies.sh +++ /dev/null @@ -1,130 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. 
-## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -python_pkgs="ninja" -export PYTHONUSERBASE=${TMPDIR}/${USER}/python/${cluster} -export PATH=${PYTHONUSERBASE}/bin:${PATH} -python3 -m pip install --user ${python_pkgs} -# Make sure the PYTHONPATH is all good. -export PYTHONPATH=$(ls --color=no -1 -d ${PYTHONUSERBASE}/lib/python*/site-packages | paste -sd ":" - ):${PYTHONPATH:-""} - -# Set to ON (or any CMake truthy value) to build all of the -# dependencies of the LBANN stack -BUILD_EXTERNAL_TPLS=ON - -# Set to ON to build Aluminum, Hydrogen, DiHydrogen, and LBANN -BUILD_LBANN_STACK=ON - -# Set to ON to enable DistConv support. Only matters if building the -# LBANN stack. -BUILD_WITH_DISTCONV=ON - -# Set to ON if you're on a Cray machine that doesn't provide the AWS -# plugin as part of its default RCCL installation. -# -# It might also be advisable to build this if you build a custom RCCL. -# The configuration script takes a RCCL path as a parameter, so it -# could matter, but it's not clear how much. 
-BUILD_AWS_OFI_RCCL_PLUGIN=ON - -# Set to the directory with the top-level CMakeLists.txt file for LBANN -LBANN_SRC_DIR=$(git rev-parse --show-toplevel) - -# Set to the directory with the top-level SuperBuild CMakeLists.txt file -SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild - -# Setup the common environment -source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh - -# Set to the preferred install directory -INSTALL_PREFIX=${INSTALL_PREFIX_EXTERNALS} - -if [ ! -e ${INSTALL_PREFIX} ]; then - mkdir -p ${INSTALL_PREFIX} -fi - -# Set to the preferred build directory -BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies - -cmake \ - -G Ninja \ - -S ${SUPERBUILD_SRC_DIR} \ - -B ${BUILD_DIR} \ - \ - -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ - -D CMAKE_BUILD_TYPE=Release \ - -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -D CMAKE_INSTALL_RPATH="${EXTRA_RPATHS}" \ - -D CMAKE_BUILD_RPATH="${EXTRA_RPATHS}" \ - \ - -D CMAKE_C_COMPILER=$(which amdclang) \ - -D CMAKE_CXX_COMPILER=$(which amdclang++) \ - -D CMAKE_Fortran_COMPILER=$(which gfortran) \ - \ - -D BUILD_SHARED_LIBS=ON \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ - \ - -D CMAKE_CXX_STANDARD=17 \ - -D CMAKE_HIP_STANDARD=17 \ - -D CMAKE_HIP_ARCHITECTURES=${AMD_GPU_ARCH} \ - \ - -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ - \ - -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ - -D LBANN_SB_DEFAULT_ROCM_OPTS=ON \ - \ - -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Caliper=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ - -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ - \ - -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_hiptt=${BUILD_EXTERNAL_TPLS} \ - 
-D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_hwloc=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ - \ - -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ - -D LBANN_SB_OpenCV_TAG=4.x \ - \ - -D LBANN_SB_BUILD_AWS_OFI_RCCL=${BUILD_AWS_OFI_RCCL_PLUGIN}} - -# Save a list of the currently loaded modules -if [ ! -e ${INSTALL_PREFIX}/logs ]; then - mkdir -p ${INSTALL_PREFIX}/logs -fi -module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh deleted file mode 100644 index 679a35b9a98..00000000000 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ /dev/null @@ -1,64 +0,0 @@ -################################################################################ -## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. 
See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Improve debugging info and remove some misguided warnings. These are -# passed only to the LBANN stack. -EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" -EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" - -# Prefer RPATH to RUNPATH (stability over flexibility) -EXTRA_LINK_FLAGS="-fuse-ld=lld -Wl,--disable-new-dtags" -# If using PrgEnv-cray add ${CRAYLIBS_X86_64} -EXTRA_RPATHS="${ROCM_PATH}/lib|${ROCM_PATH}/llvm/lib" - -# Set this to the AMD GPU arch(s) to support (example set for Crusher/Frontier/Tioga) -AMD_GPU_ARCH=gfx90a,gfx942 - -# Set to the preferred install directory -#INSTALL_PREFIX=${PWD}/install-rocm-distconv -ROCM_VER=$(basename ${ROCM_PATH}) -PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') -INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${ROCM_VER}/${PE_ENV_lc} -INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-${CRAY_MPICH_VERSION} - -if [[ "${PE_ENV_lc}" = "cray" ]]; then - # If using PrgEnv-cray add ${CRAYLIBS_X86_64} - EXTRA_RPATHS="${CRAYLIBS_X86_64}|${EXTRA_RPATHS}" - export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} -fi - -# Use an accessible build directory so that the source files are preserved for debuggin -BUILD_ROOT=/usr/workspace/lbann/ci_stable_dependencies/.build/tioga/${ROCM_VER}/${PE_ENV_lc} - -# Location of external packages -#export CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen -CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} -# Patch up the header while dealing with MiOpen -#export CPLUS_INCLUDE_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen/include:${CPLUS_INCLUDE_PATH} - -#CPLUS_INCLUDE_PATH=/p/vast1/lbann/stable_dependencies/tioga/rocm-6.1.0/miopen/include:${CPLUS_INCLUDE_PATH} - -export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} 
From 74ea19e3217415e20305a9f0bcf9dd9e9c7cb6bb Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Thu, 22 Aug 2024 15:47:40 -0700 Subject: [PATCH 150/169] Update python/lbann/contrib/lc/launcher.py Co-authored-by: Tom Benson --- python/lbann/contrib/lc/launcher.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index fd0b2ea367d..5a0231acaac 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -96,8 +96,7 @@ def prepend_environment_path(key, prefix): # Optimizations for Tioga if system in ('tioga', 'rzvernal'): #set_environment('NCCL_SOCKET_IFNAME', 'hsi') - set_environment('NCCL_NET_GDR_LEVEL', '2') # From HPE to avoid hangs -# set_environment('FI_CXI_ATS', '0') # From Adam Moody Dist PyTorch on Tioga + set_environment('NCCL_NET_GDR_LEVEL', '3') # From HPE to avoid hangs set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '0') set_environment('MIOPEN_DISABLE_CACHE', '0') tmpdir = os.environ.get('TMPDIR') From df40eeb92b4c0a6c6583283a984474f49e8576c4 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 22 Aug 2024 16:42:58 -0700 Subject: [PATCH 151/169] Add pytest to the venv. Cleaned up. 
--- .gitlab/build-and-test-corona.yml | 2 +- .gitlab/build-and-test-lassen.yml | 2 +- .gitlab/build-and-test-pascal.yml | 3 --- .gitlab/build-and-test-tioga.yml | 4 ++-- .gitlab/build-and-test.sh | 3 ++- .gitlab/setup_env.sh | 14 +++++++------- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/.gitlab/build-and-test-corona.yml b/.gitlab/build-and-test-corona.yml index 07858c15d12..a9bfa269783 100644 --- a/.gitlab/build-and-test-corona.yml +++ b/.gitlab/build-and-test-corona.yml @@ -45,7 +45,7 @@ rocm-5-7-1-distconv-corona: variables: COMPILER_FAMILY: amdclang MODULES: "rocm/5.7.1 clang/14.0.6-magic openmpi/4.1.2" - WITH_DISTCONV: "1" + WITH_DISTCONV: "ON" extends: .build-and-test-on-corona .build-and-test-on-corona: diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index a0409640e43..f372ea85723 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -46,7 +46,7 @@ clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: variables: COMPILER_FAMILY: clang MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" - WITH_DISTCONV: "1" + WITH_DISTCONV: "ON" extends: .build-and-test-on-lassen .build-and-test-on-lassen: diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml index 0e31bf307ab..5047a3a299a 100644 --- a/.gitlab/build-and-test-pascal.yml +++ b/.gitlab/build-and-test-pascal.yml @@ -40,15 +40,12 @@ clang-14-0-6-cuda-11-8-0-pascal: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" WITH_HALF: "ON" - WITH_FFT: "" extends: [.build-and-test-on-pascal, .build-and-test] clang-14-0-6-cuda-11-8-0-distconv-pascal: variables: COMPILER_FAMILY: clang MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1" - WITH_HALF: "" - WITH_FFT: "" WITH_DISTCONV: "ON" extends: [.build-and-test-on-pascal, .build-and-test] diff --git a/.gitlab/build-and-test-tioga.yml 
b/.gitlab/build-and-test-tioga.yml index 19e1940ee24..db8569606a0 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -45,7 +45,7 @@ rocm-5-7-1-distconv-tioga: variables: COMPILER_FAMILY: amdclang MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" - WITH_DISTCONV: "1" + WITH_DISTCONV: "ON" extends: .build-and-test-on-tioga @@ -59,7 +59,7 @@ rocm-6-2-0-distconv-tioga: variables: COMPILER_FAMILY: amdclang MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci" - WITH_DISTCONV: "1" + WITH_DISTCONV: "ON" extends: .build-and-test-on-tioga .build-and-test-on-tioga: diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 489026dc097..4f9f846ad6b 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -223,7 +223,8 @@ echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" +#CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" +CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann pytest protobuf tqdm numpy" echo ${CMD} ${CMD} diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 96ab96fd955..f20900d90eb 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -105,13 +105,13 @@ case "${cluster}" in launcher=flux ROCM_VER=$(basename ${ROCM_PATH}) PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') - # case "${compiler_family,,}" in - # craycc) - # PE_ENV_lc=${PE_ENV_lc}cc - # ;; - # *) - # ;; - # esac + case "${compiler_family,,}" in + craycc) + PE_ENV_lc=${PE_ENV_lc}cc + ;; + *) + ;; + esac SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-${CRAY_MPICH_VERSION} ;; corona) From 55431d0d3f26319659a968f12e7e70800223d9e6 Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Fri, 23 Aug 2024 00:05:09 -0700 Subject: [PATCH 152/169] Added code to build OpenBLAS on Power and then install standard libraries via PIP in the stable dependencies. --- .gitlab/setup_env.sh | 4 ++-- scripts/superbuild/ci/ci_core_dependencies.sh | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index f20900d90eb..392792339fd 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -194,7 +194,7 @@ else python_pkgs="ninja" fi -VENV_DIR="${TMPDIR}/${USER}/lbann_venv" +VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" CMD="python3 -m venv ${VENV_DIR}" echo "${CMD}" ${CMD} @@ -202,7 +202,7 @@ CMD="source ${VENV_DIR}/bin/activate" echo "${CMD}" ${CMD} -export PYTHONUSERBASE=${TMPDIR}/${USER}/python/${cluster} +export PYTHONUSERBASE="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/python" export PATH=${PYTHONUSERBASE}/bin:${PATH} CMD="python3 -m pip install --prefix ${PYTHONUSERBASE} ${python_pkgs}" echo "${CMD}" diff --git a/scripts/superbuild/ci/ci_core_dependencies.sh b/scripts/superbuild/ci/ci_core_dependencies.sh index 3076542cceb..754d0f47fd1 100755 --- a/scripts/superbuild/ci/ci_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_core_dependencies.sh @@ -68,7 +68,12 @@ case "${cluster}" in power9_flags="-D LBANN_SB_OpenCV_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ -D LBANN_SB_OpenCV_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ -D LBANN_SB_FWD_OpenCV_WITH_OPENJPEG=OFF \ - -D LBANN_SB_FWD_OpenCV_WITH_IPP=OFF" + -D LBANN_SB_FWD_OpenCV_WITH_IPP=OFF \ + \ + -D LBANN_SB_BUILD_OpenBLAS=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenBLAS_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ + -D LBANN_SB_OpenBLAS_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ + -D LBANN_SB_OpenBLAS_Fortran_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gfortran" ;; *) ;; @@ -161,3 +166,12 @@ module -t list 2> 
${INSTALL_PREFIX}/logs/modules.txt pushd ${BUILD_DIR} ninja popd + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Installing Python Packages with PIP" +echo "~~~~~ $(date)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${PYTHONUSERBASE} pytest protobuf tqdm numpy scipy" +echo ${CMD} +${CMD} From cafa97c5185363746ea5215b424bd542696aa81b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 23 Aug 2024 09:24:12 -0700 Subject: [PATCH 153/169] Only create the virtual environment if it doesn't exist. --- .gitlab/setup_env.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 392792339fd..0867c897089 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -195,9 +195,11 @@ else fi VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" -CMD="python3 -m venv ${VENV_DIR}" -echo "${CMD}" -${CMD} +if [[ ! -e "${VENV_DIR}/pyvenv.cfg" ]]; + CMD="python3 -m venv ${VENV_DIR}" + echo "${CMD}" + ${CMD} +fi CMD="source ${VENV_DIR}/bin/activate" echo "${CMD}" ${CMD} From cf616f74b36c3be6ddd324e501c1306b8fc0aa0b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 23 Aug 2024 09:28:57 -0700 Subject: [PATCH 154/169] Fix typo. --- .gitlab/setup_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 0867c897089..c7ab71a8073 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -195,7 +195,7 @@ else fi VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" -if [[ ! -e "${VENV_DIR}/pyvenv.cfg" ]]; +if [[ ! -e "${VENV_DIR}/pyvenv.cfg" ]]; then CMD="python3 -m venv ${VENV_DIR}" echo "${CMD}" ${CMD} From c310bdfb524d15ddd81aea677f6bc0e4a501220a Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Fri, 23 Aug 2024 13:01:52 -0700 Subject: [PATCH 155/169] Changed to installing all of the PIP installs in the virtual env directory. --- .gitlab/setup_env.sh | 23 ++++--------------- scripts/superbuild/ci/ci_core_dependencies.sh | 11 ++++++--- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index c7ab71a8073..839e0359de8 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -184,16 +184,6 @@ echo "----- DISTCONV: \"${build_distconv:-""}\"" echo "----- FFT: \"${build_fft:-""}\"" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -# Get Breathe, gcovr, and Ninja. Putting this off to the side because -# I don't want to tweak "the real" python environment, but it's just -# these one or two things so it's not worth a venv. -if [[ -n "${run_coverage:-""}" ]] -then - python_pkgs="ninja gcovr" -else - python_pkgs="ninja" -fi - VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" if [[ ! -e "${VENV_DIR}/pyvenv.cfg" ]]; then CMD="python3 -m venv ${VENV_DIR}" @@ -204,11 +194,8 @@ CMD="source ${VENV_DIR}/bin/activate" echo "${CMD}" ${CMD} -export PYTHONUSERBASE="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/python" -export PATH=${PYTHONUSERBASE}/bin:${PATH} -CMD="python3 -m pip install --prefix ${PYTHONUSERBASE} ${python_pkgs}" -echo "${CMD}" -${CMD} - -# Make sure the PYTHONPATH is all good. -export PYTHONPATH=$(ls --color=no -1 -d ${PYTHONUSERBASE}/lib/python*/site-packages | paste -sd ":" - ):${PYTHONPATH:-""} +if ! 
pip3 show ninja 1>/dev/null; then + CMD="python3 -m pip install ninja" + echo "${CMD}" + ${CMD} +fi diff --git a/scripts/superbuild/ci/ci_core_dependencies.sh b/scripts/superbuild/ci/ci_core_dependencies.sh index 754d0f47fd1..fe17d2d9ab3 100755 --- a/scripts/superbuild/ci/ci_core_dependencies.sh +++ b/scripts/superbuild/ci/ci_core_dependencies.sh @@ -172,6 +172,11 @@ echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${PYTHONUSERBASE} pytest protobuf tqdm numpy scipy" -echo ${CMD} -${CMD} +for p in pytest protobuf tqdm numpy scipy; +do + if ! pip3 show ${p} 1>/dev/null; then + CMD="python3 -m pip install -i https://pypi.org/simple ${p}" + echo ${CMD} + ${CMD} + fi +done From 2a6fa093c0658ff0ebdbe69e6b521ca35653e60a Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Fri, 23 Aug 2024 13:49:35 -0700 Subject: [PATCH 156/169] Cleanup. --- .gitlab-ci.yml | 108 ------------------------------ .gitlab/build-and-test-common.yml | 13 +--- scripts/build_lbann.sh | 2 - scripts/superbuild_externals.sh | 7 -- 4 files changed, 1 insertion(+), 129 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6ddc24652f1..fa7d2430360 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -32,117 +32,9 @@ include: - project: 'lc-templates/id_tokens' file: 'id_tokens.yml' -variables: - FF_USE_NEW_BASH_EVAL_STRATEGY: 'true' - FF_ENABLE_BASH_EXIT_CODE_CHECK: 1 - LBANN_CI_CLEAN_BUILD: 'true' - stages: - run-all-clusters -# corona testing: -# stage: run-all-clusters -# variables: -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# trigger: -# strategy: depend -# include: .gitlab/corona/pipeline.yml - -# corona distconv testing: -# stage: run-all-clusters -# variables: -# JOB_NAME_SUFFIX: _distconv -# SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" -# SPACK_SPECS: "+rocm +distconv" -# WITH_WEEKLY: 
"${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# TEST_FLAG: "test_*_distconv.py" -# trigger: -# strategy: depend -# include: .gitlab/corona/pipeline.yml - -# lassen testing: -# stage: run-all-clusters -# variables: -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# trigger: -# strategy: depend -# include: .gitlab/lassen/pipeline.yml - -# lassen distconv testing: -# stage: run-all-clusters -# variables: -# JOB_NAME_SUFFIX: _distconv -# SPACK_ENV_BASE_NAME_MODIFIER: "-multi-stage-distconv" -# SPACK_SPECS: "+cuda +distconv +fft" -# # SPACK_SPECS: "+cuda +distconv +nvshmem +fft" -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# TEST_FLAG: "test_*_distconv.py" -# trigger: -# strategy: depend -# include: .gitlab/lassen/multi_stage_pipeline.yml - -# pascal testing: -# stage: run-all-clusters -# variables: -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# trigger: -# strategy: depend -# include: .gitlab/pascal/single_pipeline.yml -# # include: .gitlab/pascal/pipeline.yml - -# pascal compiler testing: -# stage: run-all-clusters -# variables: -# SPACK_SPECS: "+cuda +half +fft" -# # SPACK_SPECS: "%gcc@10.3.1 +cuda +half +fft" -# BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# trigger: -# strategy: depend -# include: .gitlab/pascal/pipeline_compiler_tests.yml - -# pascal distconv testing: -# stage: run-all-clusters -# variables: -# JOB_NAME_SUFFIX: _distconv -# SPACK_SPECS: "+cuda +distconv +fft" -# # SPACK_SPECS: "%gcc@10.3.1 +cuda +distconv +fft" -# BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# TEST_FLAG: "test_*_distconv.py" -# trigger: -# strategy: depend -# include: .gitlab/pascal/pipeline.yml - -# tioga testing: -# stage: run-all-clusters -# variables: -# # FF_USE_NEW_BASH_EVAL_STRATEGY: 1 
-# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# trigger: -# strategy: depend -# include: .gitlab/tioga/pipeline.yml - -# tioga distconv testing: -# stage: run-all-clusters -# variables: -# JOB_NAME_SUFFIX: _distconv -# SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" -# SPACK_SPECS: "+rocm +distconv" -# WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" -# WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" -# TEST_FLAG: "test_*_distconv.py" -# trigger: -# strategy: depend -# include: .gitlab/tioga/pipeline.yml - tioga testing: stage: run-all-clusters trigger: diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index 7438828571d..b6d0459e866 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -49,18 +49,7 @@ - "${CI_PROJECT_DIR}/*junit.*xml" - "${CI_PROJECT_DIR}/ci_environment.log" - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/build.ninja" - reports: - junit: "${CI_PROJECT_DIR}/*junit.*xml" - extends: .build-and-test-base - -.build-and-test-coverage: - artifacts: - when: always - paths: - - "${CI_PROJECT_DIR}/*junit.*xml" - - "${CI_PROJECT_DIR}/ci_environment.log" - - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/build.ninja" - - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/coverage/" + - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/CMakeFiles/rules.ninja" reports: junit: "${CI_PROJECT_DIR}/*junit.*xml" extends: .build-and-test-base diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index 18d8a1ec215..5ff8b41c435 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -1097,9 +1097,7 @@ EOF BUILD_MODULES= if [[ -n "${MODULE_CMD}" ]]; then BUILD_MODULES=${MODULE_CMD//module load /} -# echo "BVE here is ${BUILD_MODULES}" BUILD_MODULES=${BUILD_MODULES// /;} -# echo "2nd time BVE here is ${BUILD_MODULES}" cat >> ${LBANN_SETUP_FILE}<> ${yaml} openblas: From 77ff196d0efe3f3a3cb2abcf4ff1d6dad82bcfa6 Mon Sep 17 00:00:00 2001 From: Brian Van Essen 
Date: Sat, 24 Aug 2024 11:49:42 -0700 Subject: [PATCH 157/169] Apply suggestions from code review Co-authored-by: Tom Benson --- .gitlab/build-and-test-lassen.yml | 1 - .gitlab/build-and-test-tioga.yml | 1 - .gitlab/run_unit_and_integration_tests.sh | 71 ----------------------- scripts/customize_build_env.sh | 2 - scripts/utilities.sh | 2 +- 5 files changed, 1 insertion(+), 76 deletions(-) diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index f372ea85723..9a5602db3eb 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -53,5 +53,4 @@ clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: stage: build tags: [lassen, batch] variables: - WITH_HALF: "" extends: .build-and-test diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index db8569606a0..ff066177330 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -67,5 +67,4 @@ rocm-6-2-0-distconv-tioga: tags: [tioga, batch] variables: LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120 -q pdebug" - WITH_HALF: "" extends: .build-and-test diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index 555467694b6..897a95fd7f3 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -8,80 +8,10 @@ cd ${LBANN_DIR}/ci_test echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Running Integration and Unit tests" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -#echo "${PWD}/run.sh CLUSTER=${CLUSTER}" PYTHON=python3 LBANN_PYTHON=lbann_pfe.sh -# WEEKLY=0 -# while :; do -# case ${1} in -# --weekly) -# # Run all tests. This is a weekly build. 
-# echo "Setting WEEKLY in run.sh" -# WEEKLY=1 -# ;; -# -?*) -# # Unknown option -# echo "Unknown option (${1})" >&2 -# exit 1 -# ;; -# *) -# # Break loop if there are no more options -# break -# esac -# shift -# done - -# # Use the spack provided by the CI -# source ${HOME}/${SPACK_REPO}/share/spack/setup-env.sh - -# # "spack" is just a shell function; it may not be exported to this -# # scope. Just to be sure, reload the shell integration. -# if [ -n "${SPACK_ROOT}" ]; then -# source ${SPACK_ROOT}/share/spack/setup-env.sh -# else -# echo "Spack required. Please set SPACK_ROOT environment variable" -# exit 1 -# fi - -# SPACK_VERSION=$(spack --version | sed 's/-.*//g' | sed 's/[(].*[)]//g') -# MIN_SPACK_VERSION=0.18.0 - -# source ${LBANN_DIR}/scripts/utilities.sh - -# compare_versions ${SPACK_VERSION} ${MIN_SPACK_VERSION} -# VALID_SPACK=$? - -# if [[ ${VALID_SPACK} -eq 2 ]]; then -# echo "Newer version of Spack required. Detected version ${SPACK_VERSION} requires at least ${MIN_SPACK_VERSION}" -# exit 1 -# fi - -# echo "run.sh WEEKLY=" -# echo $WEEKLY - -# echo "Task: Cleaning" -# ./clean.sh - -# echo "Discovered installed module file: ${LBANN_MODFILES_DIR}" -# echo "Discovered Spack environment: ${SPACK_ENV_NAME}" -# echo "Task: Compiler Tests" -# cd compiler_tests -# $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml || exit 1 - -# Find the correct module to load -# SPACK_ARCH=$(spack arch) -# SPACK_ARCH_TARGET=$(spack arch -t) -# export LBANN_BUILD_LABEL="lbann_${SYSTEM_NAME}_${SPACK_ENV_NAME}-${SPACK_ARCH_TARGET}" -# export LBANN_BUILD_PARENT_DIR="${CI_PROJECT_DIR}/builds/${LBANN_BUILD_LABEL}" -# export LBANN_INSTALL_DIR="${LBANN_BUILD_PARENT_DIR}/install" -# export LBANN_MODFILES_DIR="${LBANN_INSTALL_DIR}/etc/modulefiles" -# ml use ${LBANN_MODFILES_DIR} -# ml load lbann - -# cd unit_tests -# echo "Testing $(which lbann) from $(pwd)" case "${cluster}" in pascal) @@ -132,4 +62,3 @@ $LBANN_PYTHON -m pytest -vv --durations=0 
--junitxml=${LBANN_DIR}/unit_test_resu cd .. echo "Task: Finished with status ${status} and ${failed_tests} failed tests" -#exit $status diff --git a/scripts/customize_build_env.sh b/scripts/customize_build_env.sh index 71dcb011cfb..d99dcee94dd 100644 --- a/scripts/customize_build_env.sh +++ b/scripts/customize_build_env.sh @@ -134,7 +134,6 @@ set_center_specific_modules() MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv amd/5.7.1 rocm/5.7.1 cmake/3.24.2" # MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 amd/6.1.2 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-amd StdEnv rocm/6.1.2 cmake/3.24.2" -# MODULE_CMD="module load craype-x86-trento craype-network-ofi libfabric/2.1 perftools-base/23.12.0 cce/17.0.1 craype/2.7.31.11 cray-mpich/8.1.29 cray-libsci/24.03.0 PrgEnv-cray StdEnv rocm/6.0.3 cmake/3.24.2" ;; *) echo "No pre-specified modules found for this system. Make sure to setup your own" @@ -228,7 +227,6 @@ set_center_specific_spack_dependencies() CENTER_PIP_PACKAGES="${LBANN_HOME}/scripts/common_python_packages/requirements.txt ${LBANN_HOME}/ci_test/requirements.txt" ;; "zen3" | "zen4") # Tioga, RZVernal -# CENTER_COMPILER="%cce@17.0.1" CENTER_COMPILER="%rocmcc@5.7.1" CENTER_DEPENDENCIES="^cray-mpich@8.1.29 ^hip@5.7.1 ^python@3.9.12 ^hwloc@3.0.0" CENTER_BLAS_LIBRARY="blas=libsci" diff --git a/scripts/utilities.sh b/scripts/utilities.sh index 344a9547267..e3bf1a9561f 100644 --- a/scripts/utilities.sh +++ b/scripts/utilities.sh @@ -17,7 +17,7 @@ osx_realpath() { [[ $1 = /* ]] && echo "$1" || echo "$PWD/${1#./}" } -host_basename() { +function host_basename() { HOST=$(hostname) HOST=${HOST//[[:digit:]]/} HOST=${HOST//\-/} From a6dd4d5d428a511de399f3cfa9cb80d9a2c474cd Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Sat, 24 Aug 2024 12:19:43 -0700 Subject: [PATCH 158/169] Renamed variable AWS_OFI_RCCL_LIBRARY to AWS_OFI_RCCL_LIBDIR. --- CMakeLists.txt | 2 +- cmake/configure_files/lbann_pfe.sh.in | 6 +++--- scripts/superbuild/CMakeLists.txt | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d2cccb57a8..98dad4041e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -589,7 +589,7 @@ endif (LBANN_WITH_PYTHON_FRONTEND OR LBANN_WITH_EMBEDDED_PYTHON) if (LBANN_WITH_PYTHON_FRONTEND) set(LBANN_PFE_PYTHON_EXECUTABLE "${Python_EXECUTABLE}" CACHE FILEPATH "") set(LBANN_PFE_PYTHONPATH "${PYTHON_INSTALL_PREFIX}/${CMAKE_INSTALL_PYTHONDIR}" CACHE STRING "") - set(AWS_OFI_RCCL_LIBRARY "$ENV{AWS_OFI_RCCL_LIBRARY}" CACHE STRING "") + set(AWS_OFI_RCCL_LIBDIR "$ENV{AWS_OFI_RCCL_LIBDIR}" CACHE STRING "") configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_pfe.sh.in" "${CMAKE_BINARY_DIR}/lbann_pfe.sh" diff --git a/cmake/configure_files/lbann_pfe.sh.in b/cmake/configure_files/lbann_pfe.sh.in index 5bcd9cf6953..d8a53b10b50 100644 --- a/cmake/configure_files/lbann_pfe.sh.in +++ b/cmake/configure_files/lbann_pfe.sh.in @@ -25,8 +25,8 @@ ################################################################################ #!/bin/sh export PYTHONPATH=@LBANN_PFE_PYTHONPATH@:${PYTHONPATH} -if [[ -e "@AWS_OFI_RCCL_LIBRARY@" ]]; then - export AWS_OFI_RCCL_LIBRARY=@AWS_OFI_RCCL_LIBRARY@ - export LD_LIBRARY_PATH=@AWS_OFI_RCCL_LIBRARY@:${LD_LIBRARY_PATH} +if [[ -e "@AWS_OFI_RCCL_LIBDIR@" ]]; then + export AWS_OFI_RCCL_LIBDIR=@AWS_OFI_RCCL_LIBDIR@ + export LD_LIBRARY_PATH=@AWS_OFI_RCCL_LIBDIR@:${LD_LIBRARY_PATH} fi @LBANN_PFE_PYTHON_EXECUTABLE@ "$@" diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index 269e7e154a5..549add11d2d 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -154,11 +154,11 @@ if (LBANN_SB_BUILD_AWS_OFI_RCCL) message("export 
LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBARY_PATH\}\n") message("-----------------------------------------------------------------\n") file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" - "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") + "export AWS_OFI_RCCL_LIBDIR=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" - "export AWS_OFI_RCCL_LIBRARY=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") + "export AWS_OFI_RCCL_LIBDIR=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") endif () From 0bc591c31aaacd1f64aa4c8c22ca14565ea5d17b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Sat, 24 Aug 2024 12:34:54 -0700 Subject: [PATCH 159/169] Gather the build logs for the DHA dependencies and keep them as artifacts. 
--- .gitlab/build-and-test-common.yml | 2 ++ .gitlab/build-and-test.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index b6d0459e866..038cd15f076 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -50,6 +50,8 @@ - "${CI_PROJECT_DIR}/ci_environment.log" - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/build.ninja" - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/CMakeFiles/rules.ninja" + - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-deps/all_build_files.tar.gz" + - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-deps/all_output_logs.tar.gz" reports: junit: "${CI_PROJECT_DIR}/*junit.*xml" extends: .build-and-test-base diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 4f9f846ad6b..0f75a59de89 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -177,6 +177,7 @@ then # Uses "${cluster}", "${prefix}", and "${lbann_sb_dir}" source ${project_dir}/.gitlab/configure_deps.sh cmake --build build-deps + ninja -C build-deps gather-all # Stamp these commits cd ${build_dir}/build-deps/aluminum/src && git rev-parse HEAD > ${prefix}/al-prebuilt-hash.txt From f3880b12764a4403d6555554e9168c21d0f68276 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 26 Aug 2024 23:01:15 -0700 Subject: [PATCH 160/169] Added some cmake logic to capture the path to the python venv used during configuration. 
--- .gitlab/build-and-test.sh | 2 +- .gitlab/setup_env.sh | 8 ++++---- CMakeLists.txt | 12 ++++++++++++ cmake/configure_files/lbann_module.lua.in | 4 ++++ cmake/configure_files/lbann_module.tcl.in | 7 +++++++ 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 0f75a59de89..98133157869 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -218,7 +218,7 @@ then else ninja -C build-lbann install fi - +exit echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 839e0359de8..f5d7d661cd8 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -184,13 +184,13 @@ echo "----- DISTCONV: \"${build_distconv:-""}\"" echo "----- FFT: \"${build_fft:-""}\"" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" -if [[ ! -e "${VENV_DIR}/pyvenv.cfg" ]]; then - CMD="python3 -m venv ${VENV_DIR}" +export LBANN_PYTHON_VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" +if [[ ! 
-e "${LBANN_PYTHON_VENV_DIR}/pyvenv.cfg" ]]; then + CMD="python3 -m venv ${LBANN_PYTHON_VENV_DIR}" echo "${CMD}" ${CMD} fi -CMD="source ${VENV_DIR}/bin/activate" +CMD="source ${LBANN_PYTHON_VENV_DIR}/bin/activate" echo "${CMD}" ${CMD} diff --git a/CMakeLists.txt b/CMakeLists.txt index 98dad4041e9..7c880d11f74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1202,6 +1202,13 @@ else () set(LBANN_EXTRA_MODULE_LOADS "") endif () +if (DEFINED ENV{LBANN_PYTHON_VENV_DIR}) + set(LBANN_PYTHON_VENV_DIR "$ENV{LBANN_PYTHON_VENV_DIR}" CACHE STRING "") +else () + set(LBANN_PYTHON_VENV_DIR "" CACHE STRING "") +endif() +message("BVE I have found a variable >${LBANN_PYTHON_VENV_DIR}<") + configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.lua.in" "${CMAKE_BINARY_DIR}/lbann_module.lua.install" @@ -1216,10 +1223,15 @@ install(FILES "${CMAKE_BINARY_DIR}/lbann_module.lua.install" install(FILES "${CMAKE_BINARY_DIR}/lbann_module.tcl.install" RENAME "${LBANN_VERSION}" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles/lbann") +message("BVE ready to install with VENV variable >${LBANN_PYTHON_VENV_DIR}<") +message("BVE ready to install with VENV variable >$ENV{LBANN_PYTHON_VENV_DIR}<") install(CODE [===[ message("\n********************************************************************************") message("LBANN is installed into ${CMAKE_INSTALL_PREFIX} - to load the LBANN module:") message(" module use ${CMAKE_INSTALL_PREFIX}/etc/modulefiles") message(" module load lbann") + message(" source ${LBANN_PYTHON_VENV_DIR}/bin/activate") + message(" source $ENV{LBANN_PYTHON_VENV_DIR}/bin/activate") + message(" source \${LBANN_PYTHON_VENV_DIR}/bin/activate") message("********************************************************************************") ]===]) diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 88a9846aa81..a1c7744f5f0 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ 
b/cmake/configure_files/lbann_module.lua.in @@ -90,6 +90,10 @@ prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") prepend_path("LBANN_PYTHON_SITE_PACKAGES","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") +pushenv("LBANN_PYTHON_VENV_DIR","@LBANN_PYTHON_VENV_DIR@") +pushenv("LBANN_PYTHON_VENV_SITE_PACKAGES","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") +prepend_path("PYTHONPATH","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") + pushenv("LBANN_DIR","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@") @LBANN_EXTRA_MODULEPATH@ diff --git a/cmake/configure_files/lbann_module.tcl.in b/cmake/configure_files/lbann_module.tcl.in index e515c2c00b8..cd50424b550 100644 --- a/cmake/configure_files/lbann_module.tcl.in +++ b/cmake/configure_files/lbann_module.tcl.in @@ -81,3 +81,10 @@ Configuration: prepend-path PATH $root/@CMAKE_INSTALL_BINDIR@ prepend-path PYTHONPATH @PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@ +prepend-path LBANN_PYTHON_SITE_PACKAGES @PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@ + +pushenv LBANN_PYTHON_VENV_DIR @LBANN_PYTHON_VENV_DIR@ +pushenv LBANN_PYTHON_VENV_SITE_PACKAGES @LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@ +prepend-path PYTHONPATH @LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@ + +pushenv LBANN_DIR @CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@ From 6a62aa5e62fc67ab372c717f75dd9dbae74a6c01 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 27 Aug 2024 06:41:08 -0700 Subject: [PATCH 161/169] Removed bad debug statement. 
--- .gitlab/build-and-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 98133157869..0f75a59de89 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -218,7 +218,7 @@ then else ninja -C build-lbann install fi -exit + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" From c47b429ec114539dfddf0bced5ef665525024b08 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 27 Aug 2024 11:15:30 -0700 Subject: [PATCH 162/169] If a python virtual enviornment was defined and used during the build time, the Lua module file will now activate it when loaded. Removed the TCL module file since it wasn't being used by systems. Added a prompt name to the python venv. Fixed an empty variable field in the Lassen gitlab code that deleted other variables. --- .gitlab/build-and-test-lassen.yml | 1 - .gitlab/setup_env.sh | 2 +- CMakeLists.txt | 33 ++++----- cmake/configure_files/lbann_module.lua.in | 10 ++- cmake/configure_files/lbann_module.tcl.in | 90 ----------------------- 5 files changed, 22 insertions(+), 114 deletions(-) delete mode 100644 cmake/configure_files/lbann_module.tcl.in diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index 9a5602db3eb..d76c0af97e2 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -52,5 +52,4 @@ clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: .build-and-test-on-lassen: stage: build tags: [lassen, batch] - variables: extends: .build-and-test diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index f5d7d661cd8..f3cde94b44f 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -186,7 +186,7 @@ echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" export LBANN_PYTHON_VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" if [[ ! 
-e "${LBANN_PYTHON_VENV_DIR}/pyvenv.cfg" ]]; then - CMD="python3 -m venv ${LBANN_PYTHON_VENV_DIR}" + CMD="python3 -m venv --prompt lbann-venv ${LBANN_PYTHON_VENV_DIR}" echo "${CMD}" ${CMD} fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c880d11f74..3030cde812d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1207,31 +1207,26 @@ if (DEFINED ENV{LBANN_PYTHON_VENV_DIR}) else () set(LBANN_PYTHON_VENV_DIR "" CACHE STRING "") endif() -message("BVE I have found a variable >${LBANN_PYTHON_VENV_DIR}<") configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.lua.in" "${CMAKE_BINARY_DIR}/lbann_module.lua.install" @ONLY) -configure_file( - "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.tcl.in" - "${CMAKE_BINARY_DIR}/lbann_module.tcl.install") install(FILES "${CMAKE_BINARY_DIR}/lbann_module.lua.install" RENAME "${LBANN_MODULEFILE_NAME}" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles/lbann") -install(FILES "${CMAKE_BINARY_DIR}/lbann_module.tcl.install" - RENAME "${LBANN_VERSION}" - DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles/lbann") -message("BVE ready to install with VENV variable >${LBANN_PYTHON_VENV_DIR}<") -message("BVE ready to install with VENV variable >$ENV{LBANN_PYTHON_VENV_DIR}<") -install(CODE [===[ - message("\n********************************************************************************") - message("LBANN is installed into ${CMAKE_INSTALL_PREFIX} - to load the LBANN module:") - message(" module use ${CMAKE_INSTALL_PREFIX}/etc/modulefiles") - message(" module load lbann") - message(" source ${LBANN_PYTHON_VENV_DIR}/bin/activate") - message(" source $ENV{LBANN_PYTHON_VENV_DIR}/bin/activate") - message(" source \${LBANN_PYTHON_VENV_DIR}/bin/activate") - message("********************************************************************************") -]===]) + +if (DEFINED ENV{LBANN_PYTHON_VENV_DIR}) +set(_PYTHON_VENV_INSTALL_MSG +"\nA Python virtual environment was used to build and install LBANN. 
+ Loading the module will activate the environment in: ${LBANN_PYTHON_VENV_DIR}") +endif () + +install(CODE " + message(\"\n********************************************************************************\") + message(\"LBANN is installed into ${CMAKE_INSTALL_PREFIX} - to load the LBANN module:\") + message(\" module use ${CMAKE_INSTALL_PREFIX}/etc/modulefiles\") + message(\" module load lbann ${_PYTHON_VENV_INSTALL_MSG}\") + message(\"********************************************************************************\") +") diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index a1c7744f5f0..c9162535aa9 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -90,9 +90,13 @@ prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") prepend_path("LBANN_PYTHON_SITE_PACKAGES","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") -pushenv("LBANN_PYTHON_VENV_DIR","@LBANN_PYTHON_VENV_DIR@") -pushenv("LBANN_PYTHON_VENV_SITE_PACKAGES","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") -prepend_path("PYTHONPATH","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") +if ("@LBANN_PYTHON_VENV_DIR@" ~= "") then + pushenv("LBANN_PYTHON_VENV_DIR","@LBANN_PYTHON_VENV_DIR@") + pushenv("LBANN_PYTHON_VENV_SITE_PACKAGES","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") + prepend_path("PYTHONPATH","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") + execute {cmd="source @LBANN_PYTHON_VENV_DIR@/bin/activate", modeA={"load"}} + execute {cmd="deactivate", modeA={"unload"}} +end pushenv("LBANN_DIR","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@") diff --git a/cmake/configure_files/lbann_module.tcl.in b/cmake/configure_files/lbann_module.tcl.in deleted file mode 100644 index cd50424b550..00000000000 --- a/cmake/configure_files/lbann_module.tcl.in +++ /dev/null @@ -1,90 +0,0 @@ -#%Module - 
-################################################################################ -## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Lua (and hence LMod) should be preferred, but this will -# satisfy... less modern system needs. - -set name lbann -set version @LBANN_VERSION@ -set root @CMAKE_INSTALL_PREFIX@ - -conflict $name - -set fullname LBANN -set url https://github.com/llnl/lbann -set docs https://lbann.readthedocs.io - -set description "LBANN: Livermore Big Artificial Neural Network Toolkit." - -proc ModulesHelp { } { - global description url docs - puts stderr "Description - $description" - puts stderr - puts stderr "Docs - $url" -} - -module-whatis "Package: LBANN -Version: @LBANN_VERSION@ -Description: Livermore Big Artificial Neural Network Toolkit. - A distributed memory, HPC-optimized, model and data parallel - training toolkit for deep neural networks. 
-URL: https://github.com/llnl/lbann -Configuration: - CMAKE_INSTALL_PREFIX: @CMAKE_INSTALL_PREFIX@ - CMAKE_BUILD_TYPE: @CMAKE_BUILD_TYPE@ - CXX Compiler: @CMAKE_CXX_COMPILER@ - CXX FLAGS: @CMAKE_CXX_FLAGS@ - CXX FLAGS_DEBUG: @CMAKE_CXX_FLAGS_DEBUG@ - CXX FLAGS_RELWITHDEBINFO: @CMAKE_CXX_FLAGS_RELWITHDEBINFO@ - CXX FLAGS_RELEASE: @CMAKE_CXX_FLAGS_RELEASE@ - LBANN_GNU_LINUX: @LBANN_GNU_LINUX@ - LBANN_HAS_HYDROGEN: @LBANN_HAS_HYDROGEN@ - LBANN_HAS_OPENCV: @LBANN_HAS_OPENCV@ - LBANN_HAS_CEREAL: @LBANN_HAS_CEREAL@ - LBANN_HAS_CUDA: @LBANN_HAS_CUDA@ - LBANN_HAS_CUDNN: @LBANN_HAS_CUDNN@ - LBANN_HAS_NCCL2: @LBANN_HAS_NCCL2@ - LBANN_HAS_PROTOBUF: @LBANN_HAS_PROTOBUF@ - LBANN_HAS_CNPY: @LBANN_HAS_CNPY@ - LBANN_HAS_TBINF: @LBANN_HAS_TBINF@ - LBANN_HAS_VTUNE: @LBANN_HAS_VTUNE@ - LBANN_NVPROF: @LBANN_NVPROF@ - LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ - LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ - LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ - LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@" - -prepend-path PATH $root/@CMAKE_INSTALL_BINDIR@ -prepend-path PYTHONPATH @PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@ -prepend-path LBANN_PYTHON_SITE_PACKAGES @PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@ - -pushenv LBANN_PYTHON_VENV_DIR @LBANN_PYTHON_VENV_DIR@ -pushenv LBANN_PYTHON_VENV_SITE_PACKAGES @LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@ -prepend-path PYTHONPATH @LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@ - -pushenv LBANN_DIR @CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@ From bf3f0f3da5ebf722ae94714bae8f3fe06c02617e Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 27 Aug 2024 21:00:43 -0700 Subject: [PATCH 163/169] Trying to fix a bug where lbann_pfe.sh isn't found after loading the module. 
--- .gitlab/build-and-test.sh | 3 +++ .gitlab/run_unit_and_integration_tests.sh | 5 ++++- cmake/configure_files/lbann_module.lua.in | 10 +++++----- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 0f75a59de89..75644824b07 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -232,10 +232,13 @@ ${CMD} LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles ml use ${LBANN_MODFILES_DIR} ml load lbann +ml show lbann echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Testing LBANN: $(which lbann)" echo "~~~~~ $(date)" +echo "----- PATH: ${PATH}" +echo "----- lbann_pfe.sh: $(which lbann_pfe.sh)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" failed_tests=0 diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh index 897a95fd7f3..e5e318997b1 100644 --- a/.gitlab/run_unit_and_integration_tests.sh +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -7,12 +7,15 @@ cd ${LBANN_DIR}/ci_test echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Running Integration and Unit tests" +echo "~~~~~ lbann: $(which lbann)" +echo "~~~~~ $(date)" +echo "----- PATH: ${PATH}" +echo "----- lbann_pfe.sh: $(which lbann_pfe.sh)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" PYTHON=python3 LBANN_PYTHON=lbann_pfe.sh - case "${cluster}" in pascal) export OMPI_MCA_mpi_warn_on_fork=0 diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index c9162535aa9..81aac0b40f8 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -85,11 +85,6 @@ whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") -prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") 
-prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") -prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") -prepend_path("LBANN_PYTHON_SITE_PACKAGES","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") - if ("@LBANN_PYTHON_VENV_DIR@" ~= "") then pushenv("LBANN_PYTHON_VENV_DIR","@LBANN_PYTHON_VENV_DIR@") pushenv("LBANN_PYTHON_VENV_SITE_PACKAGES","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") @@ -98,6 +93,11 @@ if ("@LBANN_PYTHON_VENV_DIR@" ~= "") then execute {cmd="deactivate", modeA={"unload"}} end +prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") +prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") +prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") +prepend_path("LBANN_PYTHON_SITE_PACKAGES","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") + pushenv("LBANN_DIR","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@") @LBANN_EXTRA_MODULEPATH@ From 901dcae04aff534ba9ca015a19aa87b6556d7b56 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 27 Aug 2024 21:36:53 -0700 Subject: [PATCH 164/169] Temporarily remove the lua code to activate the virtual environment. 
--- cmake/configure_files/lbann_module.lua.in | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 81aac0b40f8..88a9846aa81 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -85,14 +85,6 @@ whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") -if ("@LBANN_PYTHON_VENV_DIR@" ~= "") then - pushenv("LBANN_PYTHON_VENV_DIR","@LBANN_PYTHON_VENV_DIR@") - pushenv("LBANN_PYTHON_VENV_SITE_PACKAGES","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") - prepend_path("PYTHONPATH","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") - execute {cmd="source @LBANN_PYTHON_VENV_DIR@/bin/activate", modeA={"load"}} - execute {cmd="deactivate", modeA={"unload"}} -end - prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") From 10434e7ac3059a0bd4e444e095a0e1d72ac64c5c Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 27 Aug 2024 21:59:53 -0700 Subject: [PATCH 165/169] Debugging modules. 
--- .gitlab/build-and-test.sh | 10 +++++++++- cmake/configure_files/lbann_module.lua.in | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 75644824b07..a2c1e09181a 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -230,9 +230,17 @@ echo ${CMD} ${CMD} LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles -ml use ${LBANN_MODFILES_DIR} +CMD="ml use ${LBANN_MODFILES_DIR}" +echo ${CMD} +${CMD} ml load lbann ml show lbann +CMD="ml list" +echo ${CMD} +${CMD} + +echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" +echo "LBANN_DIR: ${LBANN_DIR}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Testing LBANN: $(which lbann)" diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 88a9846aa81..81aac0b40f8 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -85,6 +85,14 @@ whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") +if ("@LBANN_PYTHON_VENV_DIR@" ~= "") then + pushenv("LBANN_PYTHON_VENV_DIR","@LBANN_PYTHON_VENV_DIR@") + pushenv("LBANN_PYTHON_VENV_SITE_PACKAGES","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") + prepend_path("PYTHONPATH","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") + execute {cmd="source @LBANN_PYTHON_VENV_DIR@/bin/activate", modeA={"load"}} + execute {cmd="deactivate", modeA={"unload"}} +end + prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") From 6068ad1f0d3e13df6256e20ec208186ed00b985b Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 27 Aug 2024 22:22:02 -0700 Subject: [PATCH 166/169] Disabled always rebuilding the dependencies. 
Added a check to deactivate an active environment before loading the LBANN module. --- .gitlab/build-and-test-common.yml | 1 - .gitlab/build-and-test.sh | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml index 038cd15f076..140c90b9c73 100644 --- a/.gitlab/build-and-test-common.yml +++ b/.gitlab/build-and-test-common.yml @@ -32,7 +32,6 @@ LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 2 -W 60" GIT_SUBMODULE_STRATEGY: none GIT_DEPTH: 5 - REBUILD_DEPS: 1 script: - printenv > ${CI_PROJECT_DIR}/ci_environment.log - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index a2c1e09181a..e2178c0ec5b 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -229,6 +229,22 @@ CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann echo ${CMD} ${CMD} +if [[ $(env | grep VIRTUAL_ENV) ]]; then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Build Script is currently in a Python virtual environment" + echo "~~~~~ Deactivate it before loading the LBANN module" + echo "~~~~~ $(date)" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + deactivate +fi +if [[ $(env | grep VIRTUAL_ENV) ]]; then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Build Script is currently in a Python virtual environment - deactivate it before loading" + echo "~~~~~ Deactivate it before loading the LBANN module" + echo "~~~~~ $(date)" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +fi + LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles CMD="ml use ${LBANN_MODFILES_DIR}" echo ${CMD} From 53fdf98e8160ba5228d16d65da9229e4d660343e Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Tue, 27 Aug 2024 23:47:44 -0700 Subject: [PATCH 167/169] Removed debugging code. 
--- .gitlab/build-and-test.sh | 19 +------------------ .gitlab/configure_deps.sh | 7 ------- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index e2178c0ec5b..530595a5700 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -224,8 +224,7 @@ echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -#CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" -CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann pytest protobuf tqdm numpy" +CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann pytest protobuf tqdm numpy scipy" echo ${CMD} ${CMD} @@ -237,32 +236,16 @@ if [[ $(env | grep VIRTUAL_ENV) ]]; then echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" deactivate fi -if [[ $(env | grep VIRTUAL_ENV) ]]; then - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Build Script is currently in a Python virtual environment - deactivate it before loading" - echo "~~~~~ Deactivate it before loading the LBANN module" - echo "~~~~~ $(date)" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -fi LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles CMD="ml use ${LBANN_MODFILES_DIR}" echo ${CMD} ${CMD} ml load lbann -ml show lbann -CMD="ml list" -echo ${CMD} -${CMD} - -echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" -echo "LBANN_DIR: ${LBANN_DIR}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "~~~~~ Testing LBANN: $(which lbann)" echo "~~~~~ $(date)" -echo "----- PATH: ${PATH}" -echo "----- lbann_pfe.sh: $(which lbann_pfe.sh)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" failed_tests=0 diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh index 4d01678f335..ed32b90df44 100644 --- a/.gitlab/configure_deps.sh +++ b/.gitlab/configure_deps.sh @@ -7,13 +7,6 @@ else 
dihydrogen_lapack_opt="" fi -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "----- BVE Dependency Flags:" -echo "----- HALF: ${build_half}" -echo "----- DISTCONV: ${build_distconv}" -echo "----- FFT: ${build_fft}" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - cmake \ -G Ninja \ -S ${lbann_sb_dir} \ From b2d573e5255dae02a548cf472268b5ad2ad566b8 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 28 Aug 2024 13:53:11 -0700 Subject: [PATCH 168/169] Updated the Tioga tests to use ROCm 6.2.1beta1 and craycc. --- .gitlab/build-and-test-tioga.yml | 34 +++++++++---------- .../test_integration_resnet50.py | 2 ++ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index ff066177330..3ae30edef40 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -35,30 +35,30 @@ stages: include: local: "/.gitlab/build-and-test-common.yml" -rocm-5-7-1-tioga: - variables: - COMPILER_FAMILY: amdclang - MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" - extends: .build-and-test-on-tioga +# rocm-5-7-1-tioga: +# variables: +# COMPILER_FAMILY: amdclang +# MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" +# extends: .build-and-test-on-tioga -rocm-5-7-1-distconv-tioga: - variables: - COMPILER_FAMILY: amdclang - MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" - WITH_DISTCONV: "ON" - extends: .build-and-test-on-tioga +# rocm-5-7-1-distconv-tioga: +# variables: +# COMPILER_FAMILY: amdclang +# MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" +# WITH_DISTCONV: "ON" +# extends: .build-and-test-on-tioga -rocm-6-2-0-tioga: +rocm-6-2-1beta1-tioga: variables: - COMPILER_FAMILY: amdclang - MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci" + COMPILER_FAMILY: craycc + MODULES: "PrgEnv-cray rocm/6.2.1beta1 cray-mpich/8.1.30 cray-libsci" extends: .build-and-test-on-tioga -rocm-6-2-0-distconv-tioga: 
+rocm-6-2-1beta1-distconv-tioga: variables: - COMPILER_FAMILY: amdclang - MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci" + COMPILER_FAMILY: craycc + MODULES: "PrgEnv-cray rocm/6.2.1beta1 cray-mpich/8.1.30 cray-libsci" WITH_DISTCONV: "ON" extends: .build-and-test-on-tioga diff --git a/ci_test/integration_tests/test_integration_resnet50.py b/ci_test/integration_tests/test_integration_resnet50.py index f1e4e6dd21c..81ff05826ff 100644 --- a/ci_test/integration_tests/test_integration_resnet50.py +++ b/ci_test/integration_tests/test_integration_resnet50.py @@ -54,6 +54,8 @@ 'mini_batch_size': 256, 'expected_train_accuracy_range': (2.75, 4.25), # Decreased lower limit from 3.0 to 2.75 due to variance 'expected_test_accuracy_range': (1.5, 2.11), # BVE increased upper limit from 2.1 10/28 +# 2.144 - Pascal + # 1.446 - Corona 'fraction_of_data_to_use': imagenet_fraction * 0.01, 'expected_mini_batch_times': { 'pascal': 0.43, From 309af7cada76e2083f669fe345c3bd1573b8de47 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Wed, 28 Aug 2024 15:13:45 -0700 Subject: [PATCH 169/169] Rewound the Tioga ROCm versions. 
--- .gitlab/build-and-test-tioga.yml | 35 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml index 3ae30edef40..f61553abd5f 100644 --- a/.gitlab/build-and-test-tioga.yml +++ b/.gitlab/build-and-test-tioga.yml @@ -35,30 +35,29 @@ stages: include: local: "/.gitlab/build-and-test-common.yml" -# rocm-5-7-1-tioga: -# variables: -# COMPILER_FAMILY: amdclang -# MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" -# extends: .build-and-test-on-tioga - -# rocm-5-7-1-distconv-tioga: -# variables: -# COMPILER_FAMILY: amdclang -# MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" -# WITH_DISTCONV: "ON" -# extends: .build-and-test-on-tioga +rocm-5-7-1-tioga: + variables: + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" + extends: .build-and-test-on-tioga +rocm-5-7-1-distconv-tioga: + variables: + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci" + WITH_DISTCONV: "ON" + extends: .build-and-test-on-tioga -rocm-6-2-1beta1-tioga: +rocm-6-2-0-tioga: variables: - COMPILER_FAMILY: craycc - MODULES: "PrgEnv-cray rocm/6.2.1beta1 cray-mpich/8.1.30 cray-libsci" + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci" extends: .build-and-test-on-tioga -rocm-6-2-1beta1-distconv-tioga: +rocm-6-2-0-distconv-tioga: variables: - COMPILER_FAMILY: craycc - MODULES: "PrgEnv-cray rocm/6.2.1beta1 cray-mpich/8.1.30 cray-libsci" + COMPILER_FAMILY: amdclang + MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci" WITH_DISTCONV: "ON" extends: .build-and-test-on-tioga