diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8d555aa6b68..fa7d2430360 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -28,110 +28,41 @@ # clusters. To run testing locally, consult the README in the ci_test # directory. -variables: - FF_USE_NEW_BASH_EVAL_STRATEGY: 'true' - FF_ENABLE_BASH_EXIT_CODE_CHECK: 1 - LBANN_CI_CLEAN_BUILD: 'true' +include: + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' stages: - run-all-clusters -corona testing: - stage: run-all-clusters - variables: - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/corona/pipeline.yml - -corona distconv testing: - stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" - SPACK_SPECS: "+rocm +distconv" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" - trigger: - strategy: depend - include: .gitlab/corona/pipeline.yml - -lassen testing: - stage: run-all-clusters - variables: - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/lassen/pipeline.yml - -lassen distconv testing: +tioga testing: stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_ENV_BASE_NAME_MODIFIER: "-multi-stage-distconv" - SPACK_SPECS: "+cuda +distconv +fft" -# SPACK_SPECS: "+cuda +distconv +nvshmem +fft" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" trigger: strategy: depend - include: .gitlab/lassen/multi_stage_pipeline.yml + include: '.gitlab/build-and-test-tioga.yml' + forward: + pipeline_variables: true pascal testing: stage: run-all-clusters - variables: - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" trigger: strategy: depend - include: .gitlab/pascal/pipeline.yml + include: 
'.gitlab/build-and-test-pascal.yml' + forward: + pipeline_variables: true -pascal compiler testing: - stage: run-all-clusters - variables: - SPACK_SPECS: "%gcc@10.3.1 +cuda +half +fft" - BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - trigger: - strategy: depend - include: .gitlab/pascal/pipeline_compiler_tests.yml - -pascal distconv testing: - stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_SPECS: "%gcc@10.3.1 +cuda +distconv +fft" - BUILD_SCRIPT_OPTIONS: "--no-default-mirrors" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" - trigger: - strategy: depend - include: .gitlab/pascal/pipeline.yml - -tioga testing: +corona testing: stage: run-all-clusters - variables: - # FF_USE_NEW_BASH_EVAL_STRATEGY: 1 - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" trigger: strategy: depend - include: .gitlab/tioga/pipeline.yml + include: '.gitlab/build-and-test-corona.yml' + forward: + pipeline_variables: true -tioga distconv testing: +lassen testing: stage: run-all-clusters - variables: - JOB_NAME_SUFFIX: _distconv - SPACK_ENV_BASE_NAME_MODIFIER: "-distconv" - SPACK_SPECS: "+rocm +distconv" - WITH_WEEKLY: "${LBANN_CI_RUN_WEEKLY}" - WITH_CLEAN_BUILD: "${LBANN_CI_CLEAN_BUILD}" - TEST_FLAG: "test_*_distconv.py" trigger: strategy: depend - include: .gitlab/tioga/pipeline.yml + include: '.gitlab/build-and-test-lassen.yml' + forward: + pipeline_variables: true diff --git a/.gitlab/build-and-test-common.yml b/.gitlab/build-and-test-common.yml new file mode 100644 index 00000000000..140c90b9c73 --- /dev/null +++ b/.gitlab/build-and-test-common.yml @@ -0,0 +1,56 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. 
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+.build-and-test-base:  # scheduler settings, script and dependency cache shared by every job
+  variables:
+    LLNL_SERVICE_USER: lbannusr
+    LLNL_SLURM_SCHEDULER_PARAMETERS: "-N2 -t 90"
+    LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120m"  # overridden in build-and-test-tioga.yml
+    LLNL_LSF_SCHEDULER_PARAMETERS: "-q pbatch -nnodes 2 -W 60"
+    GIT_SUBMODULE_STRATEGY: none
+    GIT_DEPTH: 5
+  script:  # ci_environment.log is uploaded as an artifact, so credential-like variables are left out of it
+    - printenv | grep -vE '^[^=]*(TOKEN|JWT|SECRET|PASSWORD|PRIVATE_KEY)[^=]*=' > ${CI_PROJECT_DIR}/ci_environment.log
+    - ${CI_PROJECT_DIR}/.gitlab/build-and-test.sh
+  cache:  # build-and-test.sh uses this directory as the install prefix of the dependencies
+    key: $CI_JOB_NAME_SLUG
+    paths:
+      - install-deps-${CI_JOB_NAME_SLUG}
+  timeout: 6h
+
+.build-and-test:  # .build-and-test-base plus JUnit reports and build logs as artifacts
+  artifacts:
+    when: always  # upload for failed jobs as well
+    paths:
+      - "${CI_PROJECT_DIR}/*junit.*xml"
+      - "${CI_PROJECT_DIR}/ci_environment.log"
+      - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/build.ninja"
+      - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-lbann/CMakeFiles/rules.ninja"
+      - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-deps/all_build_files.tar.gz"
+      - "${CI_PROJECT_DIR}/build-${CI_JOB_ID}/build-deps/all_output_logs.tar.gz"
+    reports:
+      junit: "${CI_PROJECT_DIR}/*junit.*xml"
+  extends: .build-and-test-base
diff --git
a/.gitlab/build-and-test-corona.yml b/.gitlab/build-and-test-corona.yml
new file mode 100644
index 00000000000..a9bfa269783
--- /dev/null
+++ b/.gitlab/build-and-test-corona.yml
@@ -0,0 +1,54 @@
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+default:
+  id_tokens:
+    SITE_ID_TOKEN:  # ID token requested for every job of this child pipeline
+      aud: https://lc.llnl.gov/gitlab
+
+stages:
+  - build
+
+include:
+  local: "/.gitlab/build-and-test-common.yml"  # defines the hidden .build-and-test template
+
+rocm-5-7-1-corona:  # ROCm 5.7.1 toolchain; DistConv not requested
+  variables:
+    COMPILER_FAMILY: amdclang  # NOTE(review): not read by build-and-test.sh; presumably consumed by setup_env.sh
+    MODULES: "rocm/5.7.1 clang/14.0.6-magic openmpi/4.1.2"  # passed to `module load` by build-and-test.sh
+  extends: .build-and-test-on-corona
+
+rocm-5-7-1-distconv-corona:  # same toolchain with DistConv enabled
+  variables:
+    COMPILER_FAMILY: amdclang
+    MODULES: "rocm/5.7.1 clang/14.0.6-magic openmpi/4.1.2"
+    WITH_DISTCONV: "ON"  # read by build-and-test.sh as build_distconv
+  extends: .build-and-test-on-corona
+
+.build-and-test-on-corona:  # settings shared by the two Corona jobs above
+  stage: build
+  tags: [corona, batch]
+  extends: .build-and-test
diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml
new file mode 100644
index 00000000000..d76c0af97e2
--- /dev/null
+++ b/.gitlab/build-and-test-lassen.yml
@@ -0,0 +1,55 @@
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+default:
+  id_tokens:
+    SITE_ID_TOKEN:
+      aud: https://lc.llnl.gov/gitlab
+
+stages:
+  - build
+
+include:
+  local: "/.gitlab/build-and-test-common.yml"
+
+# fftw/3.3.10-gcc-11.2.1
+clang-16-0-6-gcc-11-2-1-cuda-12-2-2-lassen:
+  variables:
+    COMPILER_FAMILY: clang
+    MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5"
+  extends: .build-and-test-on-lassen
+
+clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen:
+  variables:
+    COMPILER_FAMILY: clang
+    MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5"
+    WITH_DISTCONV: "ON"
+  extends: .build-and-test-on-lassen
+
+.build-and-test-on-lassen:
+  stage: build
+  tags: [lassen, batch]
+  extends: .build-and-test
diff --git a/.gitlab/build-and-test-pascal.yml b/.gitlab/build-and-test-pascal.yml
new file mode 100644
index 00000000000..5047a3a299a
--- /dev/null
+++ b/.gitlab/build-and-test-pascal.yml
@@ -0,0 +1,59 @@
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+default:
+  id_tokens:
+    SITE_ID_TOKEN:
+      aud: https://lc.llnl.gov/gitlab
+
+stages:
+  - build
+
+include:
+  local: "/.gitlab/build-and-test-common.yml"
+
+# CUDA 11.8 build with WITH_HALF (forwarded to Hydrogen's half/FP16 options).
+clang-14-0-6-cuda-11-8-0-pascal:
+  variables:
+    COMPILER_FAMILY: clang
+    MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1"
+    WITH_HALF: "ON"
+  extends: .build-and-test-on-pascal
+
+# Same toolchain with DistConv enabled.
+clang-14-0-6-cuda-11-8-0-distconv-pascal:
+  variables:
+    COMPILER_FAMILY: clang
+    MODULES: "clang/14.0.6-magic openmpi/4.1.2 cuda/11.8.0 ninja/1.11.1"
+    WITH_DISTCONV: "ON"
+  extends: .build-and-test-on-pascal
+
+# Settings shared by the Pascal jobs. Chains to .build-and-test like the
+# corona, lassen and tioga templates, so each job extends a single template.
+.build-and-test-on-pascal:
+  stage: build
+  tags: [pascal, batch]
+  extends: .build-and-test
diff --git a/.gitlab/build-and-test-tioga.yml b/.gitlab/build-and-test-tioga.yml
new file mode 100644
index 00000000000..f61553abd5f
--- /dev/null
+++ b/.gitlab/build-and-test-tioga.yml
@@ -0,0 +1,69 @@
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+default:
+  id_tokens:
+    SITE_ID_TOKEN:
+      aud: https://lc.llnl.gov/gitlab
+
+stages:
+  - build
+
+include:
+  local: "/.gitlab/build-and-test-common.yml"
+
+rocm-5-7-1-tioga:
+  variables:
+    COMPILER_FAMILY: amdclang
+    MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci"
+  extends: .build-and-test-on-tioga
+
+rocm-5-7-1-distconv-tioga:
+  variables:
+    COMPILER_FAMILY: amdclang
+    MODULES: "PrgEnv-cray rocm/5.7.1 cray-mpich/8.1.30 cray-libsci"
+    WITH_DISTCONV: "ON"
+  extends: .build-and-test-on-tioga
+
+rocm-6-2-0-tioga:
+  variables:
+    COMPILER_FAMILY: amdclang
+    MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci"
+  extends: .build-and-test-on-tioga
+
+rocm-6-2-0-distconv-tioga:
+  variables:
+    COMPILER_FAMILY: amdclang
+    MODULES: "PrgEnv-cray rocm/6.2.0 cray-mpich/8.1.30 cray-libsci"
+    WITH_DISTCONV: "ON"
+  extends: .build-and-test-on-tioga
+
+.build-and-test-on-tioga:
+  stage: build
+  tags: [tioga, batch]
+  variables:
+    LLNL_FLUX_SCHEDULER_PARAMETERS: "-N2 -t 120 -q pdebug"
+  extends: .build-and-test
diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh
new file mode 100755
index 00000000000..530595a5700
--- /dev/null
+++ b/.gitlab/build-and-test.sh
@@ -0,0 +1,321 @@
+#!/usr/bin/env bash
+
+################################################################################
+## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+
+# Build LBANN and its Aluminum/Hydrogen/DiHydrogen dependencies, install
+# them, and run the Catch and Python test suites.
+#
+# Optional environment inputs:
+#   MODULES        space-separated modules to load before building
+#   WITH_DISTCONV  CMake boolean; enables DistConv in DiHydrogen and LBANN
+#   WITH_HALF      CMake boolean forwarded to Hydrogen's half/FP16 options
+#   WITH_FFT       CMake boolean forwarded to LBANN_WITH_FFT
+#   WITH_COVERAGE  stored in run_coverage; not read by this script itself
+#   REBUILD_DEPS   any non-empty value forces a dependency rebuild
+#   BUILD_DIR      build tree (default: <project>/build-<job id>)
+#
+# Exits 0 only if failed_tests is 0 at the end; it is initialised below and
+# presumably updated by the sourced test scripts.
+
+# Initialize modules for users not using bash as a default shell
+modules_home=${MODULESHOME:-"/usr/share/lmod/lmod"}
+if [[ -e "${modules_home}/init/bash" ]]
+then
+    source "${modules_home}/init/bash"
+fi
+
+set -o errexit
+set -o nounset
+
+hostname="$(hostname)"
+cluster=${hostname//[0-9]/}
+
+# Prefer the git work-tree root and fall back to the current directory.
+# The assignment is the `if` condition so that errexit does not abort the
+# script when git fails; checking `$? -eq 1` afterwards could never work
+# (errexit fires first, and git exits with 128 outside a work tree).
+if ! project_dir="$(git rev-parse --show-toplevel 2>/dev/null)"
+then
+    project_dir="$(pwd)"
+fi
+
+# NOTE: No modules will be explicitly unloaded or purged. Obviously,
+# loading a new compiler will trigger the auto-unload of the existing
+# compiler module (and all the other side-effects wrt mpi, etc), but
+# no explicit action is taken by this script.
+modules=${MODULES:-""}
+run_coverage=${WITH_COVERAGE:-""}
+build_distconv=${WITH_DISTCONV:-""}
+build_half=${WITH_HALF:-""}
+build_fft=${WITH_FFT:-""}
+
+# TEST_FLAG is presumably the pytest file pattern used by the sourced test
+# scripts: empty selects everything, DistConv builds run only the DistConv
+# tests. Values CMake treats as false must not select them, because the
+# configure_*.sh scripts forward build_distconv to CMake verbatim.
+TEST_FLAG=""
+case "${build_distconv}" in
+    ""|0|[Oo][Ff][Ff]|[Nn][Oo]|[Ff][Aa][Ll][Ss][Ee]|[Nn]) ;;
+    *) TEST_FLAG="test_*_distconv.py" ;;
+esac
+
+job_unique_id=${CI_JOB_ID:-""}
+prefix=""
+
+# Setup the module environment
+if [[ -n "${modules}" ]]
+then
+    echo "Loading modules: \"${modules}\""
+    # Intentionally unquoted: each word of MODULES is a separate module.
+    module load ${modules}
+fi
+
+# Finish setting up the environment
+source "${project_dir}/.gitlab/setup_env.sh"
+
+# Make sure our working directory is something sane.
+cd "${project_dir}"
+
+# Create some temporary build space.
+if [[ -z "${job_unique_id}" ]]; then
+    # Manual runs get a minute-resolution id; wait for an unused one. The
+    # check targets the default build directory: prefix is still empty
+    # here, so "${prefix}-${job_unique_id}" never named a real directory.
+    job_unique_id=manual_job_$(date +%F_%0H%0M)
+    while [[ -d "${project_dir}/build-${job_unique_id}" ]] ; do
+        sleep 1
+        job_unique_id=manual_job_$(date +%F_%0H%0M)
+    done
+fi
+build_dir=${BUILD_DIR:-"${project_dir}/build-${job_unique_id}"}
+mkdir -p "${build_dir}"
+
+# Dependencies
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+echo "~~~~~ Build and test started"
+echo "~~~~~ Start: $(date)"
+echo "~~~~~ Host: ${hostname}"
+echo "~~~~~ Project dir: ${project_dir}"
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+# Dependency install prefix; the same path as the cache entry in
+# build-and-test-common.yml when CI_JOB_NAME_SLUG is set.
+prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}"
+dha_prefix=${prefix}
+
+# Just for good measure...
+# The ":+" form keeps nounset from aborting when CMAKE_PREFIX_PATH is unset.
+export CMAKE_PREFIX_PATH=${dha_prefix}/aluminum:${dha_prefix}/hydrogen:${dha_prefix}/dihydrogen${CMAKE_PREFIX_PATH:+:${CMAKE_PREFIX_PATH}}
+CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;}
+
+# Allow a user to force this
+rebuild_deps=${REBUILD_DEPS:-""}
+
+# Rebuild if the prefix doesn't exist.
+if [[ ! -d "${prefix}" ]]
+then
+    rebuild_deps=1
+fi
+
+# Rebuild if latest hashes don't match
+if [[ -z "${rebuild_deps}" ]]
+then
+    function fetch-sha {
+        # $1 is the GitHub owner (e.g., 'llnl')
+        # $2 is the repository name (e.g., 'aluminum')
+        # $3 is the branch name (e.g., 'master')
+        #
+        # Prints the commit SHA, or "unknown" when the request fails. The
+        # fallback keeps errexit from aborting the script on a network or
+        # rate-limit error and can never equal a stamped hash, so the
+        # dependencies are simply rebuilt in that case.
+        curl -sf -H "Accept: application/vnd.github.VERSION.sha" \
+             "https://api.github.com/repos/$1/$2/commits/$3" \
+            || echo "unknown"
+    }
+
+    al_head=$(fetch-sha llnl aluminum master)
+    al_prebuilt=""
+    if [[ -f "${prefix}/al-prebuilt-hash.txt" ]]
+    then
+        al_prebuilt=$(cat "${prefix}/al-prebuilt-hash.txt")
+    fi
+
+    h_head=$(fetch-sha llnl elemental hydrogen)
+    h_prebuilt=""
+    if [[ -f "${prefix}/h-prebuilt-hash.txt" ]]
+    then
+        h_prebuilt=$(cat "${prefix}/h-prebuilt-hash.txt")
+    fi
+
+    # NOTE(review): configure_deps.sh currently builds DiHydrogen from the
+    # benson31/dihydrogen fork (branch fix-rocm-6-2-0-build), so the stamped
+    # hash has to be compared against that branch; comparing against
+    # llnl/dihydrogen develop would presumably never match and force a
+    # rebuild on every run. Keep this in sync with configure_deps.sh.
+    h2_head=$(fetch-sha benson31 dihydrogen fix-rocm-6-2-0-build)
+    h2_prebuilt=""
+    if [[ -f "${prefix}/h2-prebuilt-hash.txt" ]]
+    then
+        h2_prebuilt=$(cat "${prefix}/h2-prebuilt-hash.txt")
+    fi
+
+    if [[ "${al_head}" != "${al_prebuilt}" ]]
+    then
+        echo "Prebuilt Aluminum hash does not match latest head; rebuilding."
+        echo " (prebuilt: ${al_prebuilt}; head: ${al_head})"
+        rebuild_deps=1
+    fi
+    if [[ "${h_head}" != "${h_prebuilt}" ]]
+    then
+        echo "Prebuilt Hydrogen hash does not match latest head; rebuilding."
+        echo " (prebuilt: ${h_prebuilt}; head: ${h_head})"
+        rebuild_deps=1
+    fi
+    if [[ "${h2_head}" != "${h2_prebuilt}" ]]
+    then
+        echo "Prebuilt DiHydrogen hash does not match latest head; rebuilding."
+        echo " (prebuilt: ${h2_prebuilt}; head: ${h2_head})"
+        rebuild_deps=1
+    fi
+fi
+
+if [[ -n "${rebuild_deps}" ]]
+then
+
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    echo "~~~~~ Building Dependencies"
+    echo "~~~~~ Build dir: ${build_dir}"
+    echo "~~~~~ Install dir: ${prefix}"
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+    # Set the superbuild dir
+    lbann_sb_dir=${project_dir}/scripts/superbuild
+
+    cd "${build_dir}"
+    # Uses "${cluster}", "${prefix}", and "${lbann_sb_dir}"
+    source "${project_dir}/.gitlab/configure_deps.sh"
+    cmake --build build-deps
+    ninja -C build-deps gather-all
+
+    # Stamp these commits
+    cd "${build_dir}/build-deps/aluminum/src" && git rev-parse HEAD > "${prefix}/al-prebuilt-hash.txt"
+    cd "${build_dir}/build-deps/hydrogen/src" && git rev-parse HEAD > "${prefix}/h-prebuilt-hash.txt"
+    cd "${build_dir}/build-deps/dihydrogen/src" && git rev-parse HEAD > "${prefix}/h2-prebuilt-hash.txt"
+
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    echo "~~~~~ Dependencies Built"
+    echo "~~~~~ $(date)"
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+else
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    echo "~~~~~ Using Cached Dependencies"
+    echo "~~~~~ Prefix: ${prefix}"
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+    # Rewrite the prefix= line of every cached pkg-config file to its current
+    # location (three directories above the .pc file) -- presumably because
+    # the cache can be restored under a different path than it was built in.
+    for f in $(find ${prefix} -iname "*.pc");
+    do
+        pfx=$(realpath "$(dirname "$(dirname "$(dirname "$f")")")")
+        echo " >> Changing prefix in $(realpath "$f") to: ${pfx}"
+        sed -i -e "s|^prefix=.*|prefix=${pfx}|g" "$f"
+    done
+fi
+
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+echo "~~~~~ Building LBANN"
+echo "~~~~~ $(date)"
+echo "~~~~~ CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}"
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+# From here on prefix is the LBANN install root, not the dependency prefix.
+prefix=${build_dir}/install
+cd "${build_dir}"
+source "${project_dir}/.gitlab/configure_lbann.sh"
+if ! cmake --build build-lbann ;
+then
+    echo "ERROR: compilation failed, building with verbose output..."
+    # The serial rerun only produces a readable log. Fail here either way:
+    # nothing has been installed, so the later steps could not succeed.
+    cmake --build build-lbann --verbose -j 1 || true
+    exit 1
+fi
+ninja -C build-lbann install
+
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+echo "~~~~~ Installing Python Packages with PIP"
+echo "~~~~~ $(date)"
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann pytest protobuf tqdm numpy scipy"
+echo ${CMD}
+${CMD}
+
+# Leave any active Python virtual environment before loading the LBANN
+# module. `deactivate` is a shell function that only exists when the
+# environment was activated in this shell, so check for it instead of
+# letting errexit abort on "command not found".
+if [[ -n "${VIRTUAL_ENV:-}" ]]; then
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    echo "~~~~~ Build Script is currently in a Python virtual environment"
+    echo "~~~~~ Deactivate it before loading the LBANN module"
+    echo "~~~~~ $(date)"
+    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    if declare -F deactivate > /dev/null; then
+        deactivate
+    fi
+fi
+
+LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles
+CMD="ml use ${LBANN_MODFILES_DIR}"
+echo ${CMD}
+${CMD}
+ml load lbann
+
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+echo "~~~~~ Testing LBANN: $(which lbann)"
+echo "~~~~~ $(date)"
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+failed_tests=0
+source ${project_dir}/.gitlab/run_catch_tests.sh
+
+source ${project_dir}/.gitlab/run_unit_and_integration_tests.sh
+
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+echo "~~~~~ LBANN Tests Complete"
+echo "~~~~~ $(date)"
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+echo "~~~~~ Build and test completed"
+echo "~~~~~ $(date)"
+echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+
+[[ "${failed_tests}" -eq 0 ]] && exit 0 || exit 1
diff --git a/.gitlab/configure_deps.sh b/.gitlab/configure_deps.sh
new file mode 100644
index 00000000000..ed32b90df44
--- /dev/null
+++ b/.gitlab/configure_deps.sh
@@ -0,0 +1,90 @@
+# Configure the LBANN superbuild (Aluminum, Hydrogen and DiHydrogen).
+#
+# Sourced by build-and-test.sh, which provides cluster, lbann_sb_dir,
+# build_dir, prefix, CMAKE_CMAKE_PREFIX_PATH, build_half and build_distconv.
+# The other variables read here (common_linker_flags, extra_rpaths, gpu_arch,
+# cuda_platform, rocm_platform, EXTRA_*_FLAGS, FWD_CMAKE_PREFIX_PATH) are not
+# set there; presumably setup_env.sh defines them.
+#
+# NOTE(review): the RPATH lists use "|" where configure_lbann.sh uses ";";
+# presumably the superbuild wants "|" for lists it forwards to sub-projects.
+# Confirm before changing either.
+#
+# NOTE(review): DiHydrogen is pinned to a personal fork and branch below.
+# build-and-test.sh validates the dependency cache against the head of that
+# same branch, so update both places together when the pin is dropped.
+
+if [[ "$cluster" == "lassen" ]]
+then
+    hydrogen_lapack_opt="-D LBANN_SB_FWD_Hydrogen_BLA_VENDOR=Generic"
+    dihydrogen_lapack_opt="-D LBANN_SB_FWD_DiHydrogen_BLA_VENDOR=Generic"
+else
+    hydrogen_lapack_opt=""
+    dihydrogen_lapack_opt=""
+fi
+
+cmake \
+    -G Ninja \
+    -S "${lbann_sb_dir}" \
+    -B "${build_dir}/build-deps" \
+    \
+    -D CMAKE_PREFIX_PATH="${CMAKE_CMAKE_PREFIX_PATH}" \
+    -D CMAKE_BUILD_TYPE=Release \
+    -D CMAKE_INSTALL_PREFIX="${prefix}" \
+    \
+    -D CMAKE_EXE_LINKER_FLAGS="${common_linker_flags}" \
+    -D CMAKE_SHARED_LINKER_FLAGS="${common_linker_flags}" \
+    \
+    -D CMAKE_BUILD_RPATH="${extra_rpaths//:/|}" \
+    -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/|}" \
+    \
+    -D BUILD_SHARED_LIBS=ON \
+    -D CMAKE_BUILD_RPATH_USE_ORIGIN=OFF \
+    -D CMAKE_BUILD_WITH_INSTALL_RPATH=OFF \
+    -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \
+    -D CMAKE_SKIP_BUILD_RPATH=OFF \
+    -D CMAKE_SKIP_INSTALL_RPATH=OFF \
+    -D CMAKE_SKIP_RPATH=OFF \
+    \
+    -D CMAKE_CXX_STANDARD=17 \
+    -D CMAKE_CUDA_STANDARD=17 \
+    -D CMAKE_HIP_STANDARD=17 \
+    \
+    -D CMAKE_CUDA_ARCHITECTURES="${gpu_arch}" \
+    -D CMAKE_HIP_ARCHITECTURES="${gpu_arch}" \
+    \
+    -D CMAKE_POSITION_INDEPENDENT_CODE=ON \
+    \
+    -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \
+    -D LBANN_SB_DEFAULT_CUDA_OPTS="${cuda_platform}" \
+    -D LBANN_SB_DEFAULT_ROCM_OPTS="${rocm_platform}" \
+    \
+    -D LBANN_SB_BUILD_Aluminum=ON \
+    -D LBANN_SB_Aluminum_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+    -D LBANN_SB_Aluminum_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
+    -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_CALIPER=OFF \
+    -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_NCCL=ON \
+    -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_HOST_TRANSFER=OFF \
+    -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_TESTS=OFF \
+    -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_BENCHMARKS=OFF \
+    -D LBANN_SB_FWD_Aluminum_ALUMINUM_ENABLE_THREAD_MULTIPLE=OFF \
+    -D LBANN_SB_FWD_Aluminum_CMAKE_PREFIX_PATH="${FWD_CMAKE_PREFIX_PATH}" \
+    \
+    -D LBANN_SB_BUILD_Hydrogen=ON \
+    ${hydrogen_lapack_opt} \
+    -D LBANN_SB_Hydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+    -D LBANN_SB_Hydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
+    -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_HALF=${build_half:-OFF} \
+    -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_GPU_FP16=${build_half:-OFF} \
+    -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_TESTING=ON \
+    -D LBANN_SB_FWD_Hydrogen_Hydrogen_ENABLE_UNIT_TESTS=OFF \
+    -D LBANN_SB_FWD_Hydrogen_CMAKE_PREFIX_PATH="${FWD_CMAKE_PREFIX_PATH}" \
+    \
+    -D LBANN_SB_BUILD_DiHydrogen=ON \
+    ${dihydrogen_lapack_opt} \
+    -D LBANN_SB_DiHydrogen_TAG=fix-rocm-6-2-0-build \
+    -D LBANN_SB_DiHydrogen_URL=https://github.com/benson31/dihydrogen \
+    -D LBANN_SB_DiHydrogen_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+    -D LBANN_SB_DiHydrogen_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
+    -D LBANN_SB_FWD_DiHydrogen_H2_ENABLE_DISTCONV_LEGACY=${build_distconv:-OFF} \
+    -D LBANN_SB_FWD_DiHydrogen_CMAKE_PREFIX_PATH="${FWD_CMAKE_PREFIX_PATH}"
diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh
new file mode 100644
index 00000000000..80f1c7721d8
--- /dev/null
+++ b/.gitlab/configure_lbann.sh
@@ -0,0 +1,66 @@
+# Configure LBANN itself against the dependencies under ${dha_prefix}.
+#
+# Sourced by build-and-test.sh, which provides cluster, project_dir,
+# build_dir, prefix, dha_prefix, CMAKE_CMAKE_PREFIX_PATH, build_distconv
+# and build_fft. gpu_arch and the EXTRA_*_FLAGS variables are not set
+# there; presumably setup_env.sh defines them.
+
+if [[ "$cluster" == "lassen" ]]
+then
+    lbann_lapack_opt="-D BLA_VENDOR=Generic"
+    build_fft=OFF
+else
+    lbann_lapack_opt=""
+fi
+
+if [[ "$cluster" == "tioga" ]]
+then
+    build_fft=OFF
+fi
+
+# Default RPATH rules will not include in-source libraries from the prefix path... add them here.
+# The ":+" form appends an existing list and also tolerates an unset extra_rpaths under nounset.
+extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64${extra_rpaths:+:${extra_rpaths}}
+
+cmake -G Ninja \
+    -S "${project_dir}" \
+    -B "${build_dir}/build-lbann" \
+    \
+    -D CMAKE_PREFIX_PATH="${CMAKE_CMAKE_PREFIX_PATH}" \
+    -D CMAKE_BUILD_TYPE=RelWithDebInfo \
+    -D CMAKE_INSTALL_PREFIX="${prefix}/lbann" \
+    \
+    -D CMAKE_BUILD_RPATH="${extra_rpaths//:/;}" \
+    -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/;}" \
+    -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \
+    \
+    -D CMAKE_CXX_STANDARD=17 \
+    -D CMAKE_CUDA_STANDARD=17 \
+    -D CMAKE_HIP_STANDARD=17 \
+    \
+    -D CMAKE_CUDA_ARCHITECTURES="${gpu_arch}" \
+    -D CMAKE_HIP_ARCHITECTURES="${gpu_arch}" \
+    -D AMDGPU_TARGETS="${gpu_arch}" \
+    -D GPU_TARGETS="${gpu_arch}" \
+    \
+    -D BUILD_SHARED_LIBS=ON \
+    -D CMAKE_POSITION_INDEPENDENT_CODE=ON \
+    -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \
+    -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \
+    \
+    -D CMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -D CMAKE_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+    -D CMAKE_HIP_FLAGS="${EXTRA_HIP_FLAGS}" \
+    ${lbann_lapack_opt} \
+    -D LBANN_DATATYPE=float \
+    -D LBANN_WITH_CALIPER=OFF \
+    -D LBANN_WITH_DISTCONV=${build_distconv:-OFF} \
+    -D LBANN_WITH_TBINF=OFF \
+    -D LBANN_WITH_UNIT_TESTING=ON \
+    -D LBANN_WITH_CNPY=ON \
+    -D LBANN_DETERMINISTIC=ON \
+    -D LBANN_WITH_ADDRESS_SANITIZER=OFF \
+    -D LBANN_WITH_FFT=${build_fft:-OFF} \
+    -D LBANN_WITH_EMBEDDED_PYTHON=ON \
+    -D LBANN_WITH_PYTHON_FRONTEND=ON \
+    -D LBANN_WITH_VISION=ON
diff --git a/.gitlab/corona/pipeline.yml b/.gitlab/corona/pipeline.yml
index 8458c06eea5..4820883cd62 100644
--- a/.gitlab/corona/pipeline.yml
+++ b/.gitlab/corona/pipeline.yml
@@ -73,8 +73,9 @@ build and install:
     - !reference [.setup_spack, script]
     - flux proxy ${JOB_ID} flux run -N 1 -t 30m ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG}
       -l ${SPACK_ENV_NAME} -j
${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=4 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux run -N 1 spack arch) diff --git a/.gitlab/pascal/pipeline.yml b/.gitlab/pascal/pipeline.yml index 1a8d0e85d8d..fb0dca3eb6c 100644 --- a/.gitlab/pascal/pipeline.yml +++ b/.gitlab/pascal/pipeline.yml @@ -70,8 +70,9 @@ build and install: - !reference [.setup_spack, script] - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=2 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(spack arch) @@ -177,10 +178,10 @@ release allocation: variables: # Just the obvious identifier. Which specific node doesn't matter. 
SYSTEM_NAME: pascal - # SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} - # SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git - SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test - SPACK_REPO: spack_repos/spack_test.git + SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} + SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git + #SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test + #SPACK_REPO: spack_repos/spack_test.git # These are system-specific specs that should be forwarded to the # build script diff --git a/.gitlab/pascal/pipeline_compiler_tests.yml b/.gitlab/pascal/pipeline_compiler_tests.yml index e8633d2e3f5..99a48200f58 100644 --- a/.gitlab/pascal/pipeline_compiler_tests.yml +++ b/.gitlab/pascal/pipeline_compiler_tests.yml @@ -67,8 +67,9 @@ build and install: - !reference [.setup_spack, script] - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip ${BUILD_SCRIPT_OPTIONS} -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib - export TEST_TASKS_PER_NODE=2 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(spack arch) @@ -123,10 +124,10 @@ release allocation: variables: # Just the obvious identifier. Which specific node doesn't matter. 
SYSTEM_NAME: pascal - # SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} - # SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git - SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test - SPACK_REPO: spack_repos/spack_test.git + SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} + SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git + #SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test + #SPACK_REPO: spack_repos/spack_test.git # Specific extensions to the SPACK_ENV_BASE_NAME SPACK_ENV_BASE_NAME_EXTENSION: -gcc diff --git a/.gitlab/pascal/single_pipeline.yml b/.gitlab/pascal/single_pipeline.yml new file mode 100644 index 00000000000..8becbd000f3 --- /dev/null +++ b/.gitlab/pascal/single_pipeline.yml @@ -0,0 +1,165 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. 
+################################################################################ + +# This is the testing pipeline for the Pascal cluster at LLNL. This +# cluster builds the LBANN applications and libraries using a single +# compiler toolchain and then runs a collection of tests. Testing +# output is in JUnit format and parsed by the pipeline for web +# viewing. + +# See the Catalyst pipeline for more thorough documentation. + +include: + - .gitlab/common/common.yml + +stages: + - allocate + - build + - test + - deallocate + +# Get LC resources. +allocate lc resources: + stage: allocate + extends: + - .pascal common + - .lbann-base-vars + variables: + GIT_STRATEGY: none + script: + - echo "== ACQUIRING SLURM RESOURCES ==" + - echo "${WITH_WEEKLY:+Running with --weekly}" + - export TEST_TIME=$([[ -n "${WITH_WEEKLY}" ]] && echo "120" || echo "90") + - export LBANN_NNODES=$([[ -n "${WITH_WEEKLY}" ]] && echo "4" || echo "2") + - salloc --exclusive -N ${LBANN_NNODES} -p pbatch -t ${TEST_TIME} --no-shell -J ${JOB_NAME} + timeout: 6h + +# Build LBANN and establish the Spack environment for this pipeline. 
+build and install: + extends: + - .pascal common + - .lbann-base-vars + - .lbann-artifacts + stage: build + script: + - echo "== BUILDING LBANN ==" + - export JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A") + - !reference [.setup_spack, script] + - srun --jobid=${JOB_ID} -N 1 -t 30 ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} + -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies + --ci-pip -- + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib + - export TEST_TASKS_PER_NODE=2 + - export TEST_MPIBIND_FLAG="--mpibind=off" + - export SPACK_ARCH=$(spack arch) + - export SPACK_ARCH_TARGET=$(spack arch -t) + - !reference [.setup_lbann, script] + - .gitlab/common/run-catch-tests.sh +# + - echo "== SETUP SPACK ENVIRONMENT ==" + - export SPACK_USER_CACHE_PATH=${SPACK_USER_CACHE_PATH} + - source ${HOME}/${SPACK_REPO}/share/spack/setup-env.sh + - source spack-ci-env-name.sh + - ml use ${LBANN_MODFILES_DIR} + - ml load lbann + - echo "$(which lbann)" +# + - ([[ $(find ${RESULTS_DIR} -name "catch-tests-failed.txt" | wc -l) -eq 0 ]]) +# + - echo "== RUNNING PYTHON-BASED UNIT TESTS ==" + - echo "Testing $(which lbann)" + - export OMP_NUM_THREADS=10 + - export SLURM_JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A") + - pushd ci_test/unit_tests + # - echo "Running unit tests with file pattern: ${TEST_FLAG}" + - lbann_pfe.sh -m pytest -s -vv --durations=0 --junitxml=results.xml ${TEST_FLAG} + - popd +# + - echo "== RUNNING PYTHON-BASED INTEGRATION TESTS ==" + - echo "Testing $(which lbann)" + - export OMP_NUM_THREADS=10 + # - export SLURM_JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A") + - pushd ci_test/integration_tests + - export WEEKLY_FLAG=${WITH_WEEKLY:+--weekly} + # - echo "Running integration tests with file pattern: ${TEST_FLAG}" + # - echo "lbann_pfe.sh -m pytest -s -vv --durations=0 ${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG}" + - lbann_pfe.sh -m pytest -s -vv --durations=0 
${WEEKLY_FLAG} --junitxml=results.xml ${TEST_FLAG} + - popd + artifacts: + when: always + paths: + - ci_test/unit_tests/results.xml + - ci_test/integration_tests/results.xml + reports: + junit: ["${RESULTS_DIR}/*.xml", # one list; repeated "junit:" keys would override each other + ci_test/unit_tests/results.xml, + ci_test/integration_tests/results.xml] + +# Cleanup the pipeline's Spack environment. +# Switching over to reusing Spack environments for each feature branch so don't remove them immediately +# Cleanup any build directories and spack environments older than 5 days since last use +remove spack environment: + extends: + - .pascal common + - .lbann-base-vars + - .cleanup old spack environment + stage: deallocate + variables: + GIT_STRATEGY: none + when: always + +# Free the allocation we obtained in "allocate lc resources". +release allocation: + stage: deallocate + extends: + - .pascal common + - .lbann-base-vars + variables: + GIT_STRATEGY: none + when: always + script: + - echo "== RELEASING RESOURCES ==" + - export JOB_ID=$(squeue -h -n "${JOB_NAME}" -o "%A") + - ([[ -n "${JOB_ID}" ]] && scancel ${JOB_ID}) + +# Variables for Pascal. +.pascal common: + variables: + # Just the obvious identifier. Which specific node doesn't matter.
+ SYSTEM_NAME: pascal + SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_${SYSTEM_NAME} + SPACK_REPO: spack_repos/spack_${SYSTEM_NAME}.git + #SPACK_USER_CACHE_PATH: /g/g14/lbannusr/spack_repos/.spack_test + #SPACK_REPO: spack_repos/spack_test.git + + # These are system-specific specs that should be forwarded to the + # build script + SPACK_SPECS: "+cuda +half +fft" + + tags: + - pascal + - shell diff --git a/.gitlab/run_catch_tests.sh b/.gitlab/run_catch_tests.sh new file mode 100644 index 00000000000..c5df67dd9b5 --- /dev/null +++ b/.gitlab/run_catch_tests.sh @@ -0,0 +1,79 @@ +# Run the sequential catch tests +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Sequential catch tests" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +timeout -k 1m 10m \ + ${build_dir}/build-lbann/unit_test/seq-catch-tests \ + -r console \ + -r JUnit::out=${project_dir}/seq-tests_junit.xml \ + || { + failed_tests=$(( ${failed_tests} + $? )) + echo "******************************" + echo " >>> seq-catch-tests FAILED" + echo "******************************" +} + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ MPI Catch Tests" +echo "----- LBANN output logged to: ${project_dir}/lbann-log-mpi-catch-tests.log" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +case "${cluster}" in + pascal) + export OMPI_MCA_mpi_warn_on_fork=0 + timeout -k 1m 10m \ + srun -N1 -n2 --ntasks-per-node=2 --mpibind=off \ + -D ${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r console::out=${project_dir}/mpi-catch-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-tests-rank=%r-size=%s_junit.xml \ + > ${project_dir}/lbann-log-mpi-catch-tests.log 2>&1 \ + || { + failed_tests=$((${failed_tests=} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" + } + ;; + lassen) + timeout -k 1m 10m \ + jsrun -n1 -r1 -a4 -c40 -g4 -d packed -b 
packed:10 \ + -h ${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r console::out=${project_dir}/mpi-catch-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-tests-rank=%r-size=%s_junit.xml \ + > ${project_dir}/lbann-log-mpi-catch-tests.log 2>&1 \ + || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" + } + ;; + corona|tioga) + export H2_SELECT_DEVICE_0=1 + timeout -k 1m 10m \ + flux run -N1 -n8 -g1 --exclusive \ + --cwd=${build_dir}/build-lbann \ + ${build_dir}/build-lbann/unit_test/mpi-catch-tests \ + -r console::out=${project_dir}/mpi-catch-tests-console-rank=%r-size=%s.log \ + -r JUnit::out=${project_dir}/mpi-catch-tests-rank=%r-size=%s_junit.xml \ + > ${project_dir}/lbann-log-mpi-catch-tests.log 2>&1 \ + || { + failed_tests=$((${failed_tests} + $?)) + echo "******************************" + echo " >>> mpi-catch-tests FAILED" + echo "******************************" + } + ;; + *) + echo "Unknown cluster: ${cluster}" + ;; +esac + +for filename in ${project_dir}/mpi-catch-tests-console-rank=*.log; do + [ -e "$filename" ] || continue + echo "$filename" + cat $filename +done diff --git a/.gitlab/run_unit_and_integration_tests.sh b/.gitlab/run_unit_and_integration_tests.sh new file mode 100644 index 00000000000..e5e318997b1 --- /dev/null +++ b/.gitlab/run_unit_and_integration_tests.sh @@ -0,0 +1,67 @@ +#!/bin/bash -l + +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g'); cluster=${cluster:-${CLUSTER}} # the case below keys on lowercase "cluster"; fall back to the hostname-derived name when the caller did not set it +LBANN_DIR=$(git rev-parse --show-toplevel) + +cd ${LBANN_DIR}/ci_test + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Running Integration and Unit tests" +echo "~~~~~ lbann: $(which lbann)" +echo "~~~~~ $(date)" +echo "----- PATH: ${PATH}" +echo "----- lbann_pfe.sh: $(which lbann_pfe.sh)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +PYTHON=python3 +LBANN_PYTHON=lbann_pfe.sh + +case
"${cluster}" in + pascal) + export OMPI_MCA_mpi_warn_on_fork=0 + ;; + lassen) + ;; + corona|tioga) + export H2_SELECT_DEVICE_0=1 + ;; + *) + echo "Unknown cluster: ${cluster}" + ;; +esac + +export OMP_NUM_THREADS=10 + +# These tests are "allowed" to fail inside the script. That is, the +# unit tests should be run even if these fail. The status is cached +# for now. +status=0 +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "Task: Integration Tests with file pattern: ${TEST_FLAG}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +cd integration_tests +$LBANN_PYTHON -m pytest -vv --durations=0 --junitxml=${LBANN_DIR}/integration_test_results_junit.xml ${TEST_FLAG} || { + this_status=$? + status=$(( $status + $this_status )) + failed_tests=$(( $failed_tests + $this_status )) + echo "******************************" + echo " >>> Integration Tests FAILED" + echo "******************************" +} +cd .. + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "Task: Unit Tests with file pattern: ${TEST_FLAG}" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +cd unit_tests +$LBANN_PYTHON -m pytest -vv --durations=0 --junitxml=${LBANN_DIR}/unit_test_results_junit.xml ${TEST_FLAG} || { + this_status=$? + status=$(( $status + $this_status )) + failed_tests=$(( $failed_tests + $this_status )) + echo "******************************" + echo " >>> Unit Tests FAILED" + echo "******************************" +} +cd .. + +echo "Task: Finished with status ${status} and ${failed_tests} failed tests" diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh new file mode 100644 index 00000000000..f3cde94b44f --- /dev/null +++ b/.gitlab/setup_env.sh @@ -0,0 +1,201 @@ +# This is a collection of common variables and whatnot that may change +# based on the value of "${cluster}" or other variables. 
+ +# To make things work with modules, the user can set "COMPILER_FAMILY" +# to "gnu", "clang", "amdclang", or "cray" and the suitable compiler +# paths will be deduced from the current PATH. Alternatively, users +# can set "CC"/"CXX" directly, in which case the +# "COMPILER_FAMILY" variable will be ignored. + +# Prefer RPATH to RUNPATH (stability over flexibility) +common_linker_flags="-Wl,--disable-new-dtags" +CFLAGS=${CFLAGS:-""} +CXXFLAGS=${CXXFLAGS:-""} +LDFLAGS=${LDFLAGS:-""} +LDFLAGS="${common_linker_flags} ${LDFLAGS}" + +compiler_family=${COMPILER_FAMILY:-gnu} +case "${compiler_family,,}" in + gnu|gcc) + CC=${CC:-$(command -v gcc)} + CXX=${CXX:-$(command -v g++)} + EXTRA_LINK_FLAGS="-fuse-ld=gold ${common_linker_flags}" + ;; + clang) + CC=${CC:-$(command -v clang)} + CXX=${CXX:-$(command -v clang++)} + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" + ;; + amdclang) + CC=${CC:-$(command -v amdclang)} + CXX=${CXX:-$(command -v amdclang++)} + ROCM_VER=$(basename ${ROCM_PATH}) + ROCM_VER_NUM=$(echo "${ROCM_VER}" | tr -d '[a-z]') + COMPILER_VER="amdclang${ROCM_VER_NUM}" + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" + ;; + cray) + CC=${CC:-$(command -v cc)} + CXX=${CXX:-$(command -v CC)} + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" + ;; + craycc) + CC=${CC:-$(command -v craycc)} + CXX=${CXX:-$(command -v craycxx)} + EXTRA_LINK_FLAGS="-fuse-ld=lld ${common_linker_flags}" + ;; + *) + echo "Unknown compiler family: ${compiler_family}. Using gnu." 
+ CC=${CC:-$(command -v gcc)} + CXX=${CXX:-$(command -v g++)} + EXTRA_LINK_FLAGS="-fuse-ld=gold ${common_linker_flags}" + ;; +esac + +# Set the compiler version based on the path of the compiler +COMPILER_VER=${COMPILER_VER:-"$(basename $(dirname $(dirname $(which ${CC}))))"} + +# HIP/CUDA configuration and launcher are platform-specific +CUDACXX=${CUDACXX:=""} +CUDAHOSTCXX=${CUDAHOSTCXX:=${CXX}} + +cuda_platform=OFF +rocm_platform=OFF + +launcher=mpiexec + +extra_rpaths=${extra_rpaths:-""} + +# Set to the preferred install directory for the external dependencies +CI_STABLE_DEPENDENCIES_ROOT=/usr/workspace/lbann/ci_stable_dependencies +INSTALL_EXTERNALS_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/${cluster} + +case "${cluster}" in + pascal) + CUDACXX=${CUDACXX:-$(command -v nvcc)} + CUDAHOSTCXX=${CUDAHOSTCXX:-${CXX}} + cuda_platform=ON + gpu_arch=60 + launcher=slurm + CUDA_VER=$(basename ${CUDA_HOME}) + SYSTEM_INSTALL_PREFIX_EXTERNALS=${CUDA_VER}/${COMPILER_VER}/openmpi-4.1.2 + ;; + lassen) + CUDACXX=${CUDACXX:-$(command -v nvcc)} + CUDAHOSTCXX=${CUDAHOSTCXX:-${CXX}} + cuda_platform=ON + gpu_arch=70 + launcher=lsf + CUDA_VER=$(basename ${CUDA_HOME}) + SYSTEM_INSTALL_PREFIX_EXTERNALS=${CUDA_VER}/${COMPILER_VER}/spectrum-mpi-rolling-release + export CMAKE_PREFIX_PATH="${CI_STABLE_DEPENDENCIES_ROOT}/${cluster}/${CUDA_VER}/nccl_2.20.3-1+cuda12.2_ppc64le:${CI_STABLE_DEPENDENCIES_ROOT}/${cluster}/${CUDA_VER}/cudnn-linux-ppc64le-8.9.7.29_cuda12-archive:${CMAKE_PREFIX_PATH:-""}" + ;; + tioga) + cray_libs_dir=${CRAYLIBS_X86_64:-""} + if [[ -n "${cray_libs_dir}" ]] + then + extra_rpaths="${cray_libs_dir}:${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" + export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} + else + extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" + fi + rocm_platform=ON +# gpu_arch=gfx90a,gfx942 + gpu_arch=gfx90a + launcher=flux + ROCM_VER=$(basename ${ROCM_PATH}) + PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') + 
case "${compiler_family,,}" in + craycc) + PE_ENV_lc=${PE_ENV_lc}cc + ;; + *) + ;; + esac + SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${PE_ENV_lc}/cray-mpich-${CRAY_MPICH_VERSION} + ;; + corona) + extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" + rocm_platform=ON + gpu_arch=gfx906 + launcher=flux + ROCM_VER=$(basename ${ROCM_PATH}) + SYSTEM_INSTALL_PREFIX_EXTERNALS=${ROCM_VER}/${COMPILER_VER}/openmpi-4.1.2 + ;; + *) + ;; +esac + +export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-""} +ci_core_cmake_prefix_path="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_sb_suggested_cmake_prefix_path.sh" +if [[ -e ${ci_core_cmake_prefix_path} ]]; then + source ${ci_core_cmake_prefix_path} +fi +if [[ "${build_half:-""}" = "ON" ]]; then + export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} +fi +case "${cluster}" in + tioga) + ROCM_VER=$(basename ${ROCM_PATH}) + if [[ "${ROCM_VER}" = "6.2.0" ]]; then + CMAKE_PREFIX_PATH=/p/vast1/lbann/stable_dependencies/${cluster}/rocm-6.2.0/miopen:${CMAKE_PREFIX_PATH} + fi + ;; + *) + ;; +esac +CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} + +# Improve debugging info and remove some misguided warnings. These are +# passed only to the LBANN stack. Add -v for debugging +EXTRA_CXX_FLAGS="-g3 -Wno-deprecated-declarations" +EXTRA_HIP_FLAGS="-g3 -Wno-deprecated-declarations" + +# Update the location of external packages +FWD_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/|} + +# Make sure the compilers and flags are exported +export CC CXX CUDACXX CUDAHOSTCXX CFLAGS CXXFLAGS LDFLAGS +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Environment Info" +echo "~~~~~" +echo "~~~~~ Cluster: ${cluster}" +echo "~~~~~ CUDA? ${cuda_platform}" +echo "~~~~~ ROCm? 
${rocm_platform}" +echo "~~~~~ GPU arch: ${gpu_arch}" +echo "~~~~~ Launcher: ${launcher}" +echo "~~~~~" +echo "~~~~~ Compiler family: ${compiler_family}" +echo "~~~~~ Compiler version: ${COMPILER_VER}" +echo "~~~~~ CC: ${CC}" +echo "~~~~~ CXX: ${CXX}" +echo "~~~~~ CUDACXX: ${CUDACXX}" +echo "~~~~~ CUDAHOSTCXX: ${CUDAHOSTCXX}" +echo "~~~~~" +echo "~~~~~ CFLAGS: ${CFLAGS}" +echo "~~~~~ CXXFLAGS: ${CXXFLAGS}" +echo "~~~~~ LDFLAGS: ${LDFLAGS}" +echo "~~~~~ Extra rpaths: ${extra_rpaths}" +echo "~~~~~ CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" +echo "-----" +echo "----- Dependency Flags:" +echo "----- HALF: \"${build_half:-""}\"" +echo "----- DISTCONV: \"${build_distconv:-""}\"" +echo "----- FFT: \"${build_fft:-""}\"" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +export LBANN_PYTHON_VENV_DIR="${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/venv" +if [[ ! -e "${LBANN_PYTHON_VENV_DIR}/pyvenv.cfg" ]]; then + CMD="python3 -m venv --prompt lbann-venv ${LBANN_PYTHON_VENV_DIR}" + echo "${CMD}" + ${CMD} +fi +CMD="source ${LBANN_PYTHON_VENV_DIR}/bin/activate" +echo "${CMD}" +${CMD} + +if ! pip3 show ninja 1>/dev/null; then + CMD="python3 -m pip install ninja" + echo "${CMD}" + ${CMD} +fi diff --git a/.gitlab/tioga/pipeline.yml b/.gitlab/tioga/pipeline.yml index 7b17fc51949..708da83f520 100644 --- a/.gitlab/tioga/pipeline.yml +++ b/.gitlab/tioga/pipeline.yml @@ -1,5 +1,5 @@ ################################################################################ -## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. ## Produced at the Lawrence Livermore National Laboratory. ## Written by the LBANN Research Team (B. Van Essen, et al.) listed in ## the CONTRIBUTORS file. 
@@ -77,8 +77,10 @@ build and install: - !reference [.setup_spack, script] - flux proxy ${JOB_ID} flux run -N 1 -t 30m ./scripts/build_lbann.sh --ci ${SPACK_DEPS_FLAG} -l ${SPACK_ENV_NAME} -j ${BUILD_TASKS} ${CLEAN_BUILD_FLAG} + --superbuild-prefix /usr/workspace/lbann/ci_stable_dependencies --ci-pip -- - +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} + +deterministic +vision +numpy +unit_tests ${SPACK_SPECS} ^zlib + # lbann_pfe.sh -m ensurepip --upgrade - export TEST_TASKS_PER_NODE=4 - export TEST_MPIBIND_FLAG="--mpibind=off" - export SPACK_ARCH=$(flux proxy ${JOB_ID} flux run -N 1 spack arch) diff --git a/CMakeLists.txt b/CMakeLists.txt index d3f6a6d75a2..3030cde812d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ################################################################################ -## Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC. +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. ## Produced at the Lawrence Livermore National Laboratory. ## Written by the LBANN Research Team (B. Van Essen, et al.) listed in ## the CONTRIBUTORS file. @@ -427,6 +427,40 @@ if (LBANN_HAS_ROCM) endif () message(STATUS "Using LBANN_ROCM_PATH: ${LBANN_ROCM_PATH}") + # The Catch2 tests are only ever build artifacts, so this needs to + # change too. + if (CMAKE_BUILD_RPATH) + # The first of these actually matters; the last 4 are just good + # measure. + list(REMOVE_ITEM CMAKE_BUILD_RPATH + "${LBANN_ROCM_PATH}/lib" + "/usr/lib64" + "/usr/lib" + "/usr/local/lib64" + "/usr/local/lib" + ) + # Write the value out to the cache + set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" + CACHE STRING "The build rpath to use" + FORCE) + endif () + + if (CMAKE_INSTALL_RPATH) + # The first of these actually matters; the last 4 are just good + # measure. 
+ list(REMOVE_ITEM CMAKE_INSTALL_RPATH + "${LBANN_ROCM_PATH}/lib" + "/usr/lib64" + "/usr/lib" + "/usr/local/lib64" + "/usr/local/lib" + ) + # Write the value out to the cache + set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" + CACHE STRING "The install rpath to use" + FORCE) + endif () + find_package(hip CONFIG REQUIRED) enable_language(HIP) find_package(MIOpen CONFIG REQUIRED) @@ -554,7 +588,8 @@ endif (LBANN_WITH_PYTHON_FRONTEND OR LBANN_WITH_EMBEDDED_PYTHON) if (LBANN_WITH_PYTHON_FRONTEND) set(LBANN_PFE_PYTHON_EXECUTABLE "${Python_EXECUTABLE}" CACHE FILEPATH "") - set(LBANN_PFE_PYTHONPATH "" CACHE STRING "") + set(LBANN_PFE_PYTHONPATH "${PYTHON_INSTALL_PREFIX}/${CMAKE_INSTALL_PYTHONDIR}" CACHE STRING "") + set(AWS_OFI_RCCL_LIBDIR "$ENV{AWS_OFI_RCCL_LIBDIR}" CACHE STRING "") configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_pfe.sh.in" "${CMAKE_BINARY_DIR}/lbann_pfe.sh" @@ -1167,28 +1202,31 @@ else () set(LBANN_EXTRA_MODULE_LOADS "") endif () +if (DEFINED ENV{LBANN_PYTHON_VENV_DIR}) + set(LBANN_PYTHON_VENV_DIR "$ENV{LBANN_PYTHON_VENV_DIR}" CACHE STRING "") +else () + set(LBANN_PYTHON_VENV_DIR "" CACHE STRING "") +endif() + configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.lua.in" "${CMAKE_BINARY_DIR}/lbann_module.lua.install" @ONLY) -configure_file( - "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_module.tcl.in" - "${CMAKE_BINARY_DIR}/lbann_module.tcl.install") install(FILES "${CMAKE_BINARY_DIR}/lbann_module.lua.install" RENAME "${LBANN_MODULEFILE_NAME}" DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles/lbann") -install(FILES "${CMAKE_BINARY_DIR}/lbann_module.tcl.install" - RENAME "${LBANN_VERSION}" - DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}/modulefiles/lbann") -install(CODE [===[ - message("\n********************************************************************************") - message("LBANN is installed into ${CMAKE_INSTALL_PREFIX} - to load the LBANN module:") - message(" module use 
${CMAKE_INSTALL_PREFIX}/etc/modulefiles") - message(" module load lbann") - message("********************************************************************************") -]===]) - - +if (DEFINED ENV{LBANN_PYTHON_VENV_DIR}) +set(_PYTHON_VENV_INSTALL_MSG +"\nA Python virtual environment was used to build and install LBANN. + Loading the module will activate the environment in: ${LBANN_PYTHON_VENV_DIR}") +endif () +install(CODE " + message(\"\n********************************************************************************\") + message(\"LBANN is installed into ${CMAKE_INSTALL_PREFIX} - to load the LBANN module:\") + message(\" module use ${CMAKE_INSTALL_PREFIX}/etc/modulefiles\") + message(\" module load lbann ${_PYTHON_VENV_INSTALL_MSG}\") + message(\"********************************************************************************\") +") diff --git a/LICENSE b/LICENSE index 68681ede2ee..5d854378863 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. Written by the LBANN Research Team (B. Van Essen, et al.) listed in the CONTRIBUTORS file. 
diff --git a/ci_test/integration_tests/test_integration_resnet50.py b/ci_test/integration_tests/test_integration_resnet50.py index f1e4e6dd21c..81ff05826ff 100644 --- a/ci_test/integration_tests/test_integration_resnet50.py +++ b/ci_test/integration_tests/test_integration_resnet50.py @@ -54,6 +54,8 @@ 'mini_batch_size': 256, 'expected_train_accuracy_range': (2.75, 4.25), # Decreased lower limit from 3.0 to 2.75 due to variance 'expected_test_accuracy_range': (1.5, 2.11), # BVE increased upper limit from 2.1 10/28 +# 2.144 - Pascal + # 1.446 - Corona 'fraction_of_data_to_use': imagenet_fraction * 0.01, 'expected_mini_batch_times': { 'pascal': 0.43, diff --git a/ci_test/requirements.txt b/ci_test/requirements.txt index 6ada807c10f..50787b5adb5 100644 --- a/ci_test/requirements.txt +++ b/ci_test/requirements.txt @@ -1,3 +1,4 @@ pytest +numpy==1.22.3 scipy tqdm diff --git a/ci_test/unit_tests/test_unit_reconstruction_loss.py b/ci_test/unit_tests/test_unit_reconstruction_loss.py index 3b6644f8cd4..0fd2921f801 100644 --- a/ci_test/unit_tests/test_unit_reconstruction_loss.py +++ b/ci_test/unit_tests/test_unit_reconstruction_loss.py @@ -18,6 +18,7 @@ def skeleton_jag_reconstruction_loss(cluster, dir_name, cluster=cluster, num_nodes=2, num_processes=32, + time_limit=3, disable_cuda=1, dir_name=dir_name, sample_list_train_default='/p/vast1/lbann/datasets/JAG/10MJAG/1M_A/100K4trainers/100Kindex.txt', diff --git a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 2d886995882..81aac0b40f8 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -85,9 +85,18 @@ whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") +if ("@LBANN_PYTHON_VENV_DIR@" ~= "") then + pushenv("LBANN_PYTHON_VENV_DIR","@LBANN_PYTHON_VENV_DIR@") + 
pushenv("LBANN_PYTHON_VENV_SITE_PACKAGES","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") + prepend_path("PYTHONPATH","@LBANN_PYTHON_VENV_DIR@/@CMAKE_INSTALL_PYTHONDIR@") + execute {cmd="source @LBANN_PYTHON_VENV_DIR@/bin/activate", modeA={"load"}} + execute {cmd="deactivate", modeA={"unload"}} +end + prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") prepend_path("LD_LIBRARY_PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@") prepend_path("PYTHONPATH","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") +prepend_path("LBANN_PYTHON_SITE_PACKAGES","@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@") pushenv("LBANN_DIR","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_DIR@") diff --git a/cmake/configure_files/lbann_module.tcl.in b/cmake/configure_files/lbann_module.tcl.in deleted file mode 100644 index e515c2c00b8..00000000000 --- a/cmake/configure_files/lbann_module.tcl.in +++ /dev/null @@ -1,83 +0,0 @@ -#%Module - -################################################################################ -## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. -## Produced at the Lawrence Livermore National Laboratory. -## Written by the LBANN Research Team (B. Van Essen, et al.) listed in -## the CONTRIBUTORS file. -## -## LLNL-CODE-697807. -## All rights reserved. -## -## This file is part of LBANN: Livermore Big Artificial Neural Network -## Toolkit. For details, see http://software.llnl.gov/LBANN or -## https://github.com/LLNL/LBANN. -## -## Licensed under the Apache License, Version 2.0 (the "Licensee"); you -## may not use this file except in compliance with the License. You may -## obtain a copy of the License at: -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -## implied. 
See the License for the specific language governing -## permissions and limitations under the license. -################################################################################ - -# Lua (and hence LMod) should be preferred, but this will -# satisfy... less modern system needs. - -set name lbann -set version @LBANN_VERSION@ -set root @CMAKE_INSTALL_PREFIX@ - -conflict $name - -set fullname LBANN -set url https://github.com/llnl/lbann -set docs https://lbann.readthedocs.io - -set description "LBANN: Livermore Big Artificial Neural Network Toolkit." - -proc ModulesHelp { } { - global description url docs - puts stderr "Description - $description" - puts stderr - puts stderr "Docs - $url" -} - -module-whatis "Package: LBANN -Version: @LBANN_VERSION@ -Description: Livermore Big Artificial Neural Network Toolkit. - A distributed memory, HPC-optimized, model and data parallel - training toolkit for deep neural networks. -URL: https://github.com/llnl/lbann -Configuration: - CMAKE_INSTALL_PREFIX: @CMAKE_INSTALL_PREFIX@ - CMAKE_BUILD_TYPE: @CMAKE_BUILD_TYPE@ - CXX Compiler: @CMAKE_CXX_COMPILER@ - CXX FLAGS: @CMAKE_CXX_FLAGS@ - CXX FLAGS_DEBUG: @CMAKE_CXX_FLAGS_DEBUG@ - CXX FLAGS_RELWITHDEBINFO: @CMAKE_CXX_FLAGS_RELWITHDEBINFO@ - CXX FLAGS_RELEASE: @CMAKE_CXX_FLAGS_RELEASE@ - LBANN_GNU_LINUX: @LBANN_GNU_LINUX@ - LBANN_HAS_HYDROGEN: @LBANN_HAS_HYDROGEN@ - LBANN_HAS_OPENCV: @LBANN_HAS_OPENCV@ - LBANN_HAS_CEREAL: @LBANN_HAS_CEREAL@ - LBANN_HAS_CUDA: @LBANN_HAS_CUDA@ - LBANN_HAS_CUDNN: @LBANN_HAS_CUDNN@ - LBANN_HAS_NCCL2: @LBANN_HAS_NCCL2@ - LBANN_HAS_PROTOBUF: @LBANN_HAS_PROTOBUF@ - LBANN_HAS_CNPY: @LBANN_HAS_CNPY@ - LBANN_HAS_TBINF: @LBANN_HAS_TBINF@ - LBANN_HAS_VTUNE: @LBANN_HAS_VTUNE@ - LBANN_NVPROF: @LBANN_NVPROF@ - LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ - LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ - LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ - LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@" - -prepend-path PATH $root/@CMAKE_INSTALL_BINDIR@ -prepend-path PYTHONPATH 
@PYTHON_INSTALL_PREFIX@/@CMAKE_INSTALL_PYTHONDIR@ diff --git a/cmake/configure_files/lbann_pfe.sh.in b/cmake/configure_files/lbann_pfe.sh.in index 402981f967c..d8a53b10b50 100644 --- a/cmake/configure_files/lbann_pfe.sh.in +++ b/cmake/configure_files/lbann_pfe.sh.in @@ -25,4 +25,8 @@ ################################################################################ #!/bin/sh export PYTHONPATH=@LBANN_PFE_PYTHONPATH@:${PYTHONPATH} +if [[ -e "@AWS_OFI_RCCL_LIBDIR@" ]]; then + export AWS_OFI_RCCL_LIBDIR=@AWS_OFI_RCCL_LIBDIR@ + export LD_LIBRARY_PATH=@AWS_OFI_RCCL_LIBDIR@:${LD_LIBRARY_PATH} +fi @LBANN_PFE_PYTHON_EXECUTABLE@ "$@" diff --git a/cmake/modules/SetupMPI.cmake b/cmake/modules/SetupMPI.cmake index f6491694a02..0f677866c11 100644 --- a/cmake/modules/SetupMPI.cmake +++ b/cmake/modules/SetupMPI.cmake @@ -26,9 +26,7 @@ # This module configures MPI and ensures the library is setup properly -if (NOT MPI_CXX_FOUND) - find_package(MPI REQUIRED COMPONENTS CXX) -endif () +find_package(MPI REQUIRED COMPONENTS C CXX) if (NOT TARGET MPI::MPI_CXX) add_library(MPI::MPI_CXX INTERFACE IMPORTED) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index 80ead930727..5a0231acaac 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -96,6 +96,7 @@ def prepend_environment_path(key, prefix): # Optimizations for Tioga if system in ('tioga', 'rzvernal'): #set_environment('NCCL_SOCKET_IFNAME', 'hsi') + set_environment('NCCL_NET_GDR_LEVEL', '3') # From HPE to avoid hangs set_environment('MIOPEN_DEBUG_DISABLE_FIND_DB', '0') set_environment('MIOPEN_DISABLE_CACHE', '0') tmpdir = os.environ.get('TMPDIR') diff --git a/scripts/build_lbann.sh b/scripts/build_lbann.sh index df9d091a356..5ff8b41c435 100755 --- a/scripts/build_lbann.sh +++ b/scripts/build_lbann.sh @@ -54,6 +54,8 @@ ALUMINUM_VER="@master" DIHYDROGEN_VER="@develop" # Default variants for Conduit to minimize dependencies 
CONDUIT_VARIANTS="~hdf5_compat~fortran~parmetis" +# User specified location for externals built with the superbuild +LBANN_SUPERBUILD_EXTERNAL_DIR="" ################################################################ # Help message @@ -98,6 +100,7 @@ Options: ${C}--tmp-build-dir${N} Put the build directory in tmp space ${C}--spec-only${N} Stop after a spack spec command ${C}-s | --stable${N} Use the latest stable defaults not the head of Hydrogen, DiHydrogen and Aluminum repos + ${C}--superbuild-prefix${N} Use the latest stable defaults not the head of Hydrogen, DiHydrogen and Aluminum repos ${C}--hydrogen-repo ${N} Use a local repository for the Hydrogen library ${C}--dihydrogen-repo ${N} Use a local repository for the DiHydrogen library ${C}--aluminum-repo ${N} Use a local repository for the Aluminum library @@ -233,6 +236,15 @@ while :; do ALUMINUM_VER="@1.0.0-lbann" DIHYDROGEN_VER= ;; + --superbuild-prefix) + if [ -n "${2}" ]; then + LBANN_SUPERBUILD_EXTERNAL_DIR=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + exit 1 + fi + ;; --hydrogen-repo) if [ -n "${2}" ]; then HYDROGEN_PATH=${2} @@ -356,6 +368,13 @@ fi # Detect system parameters CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') +LOG="spack-build-${LBANN_ENV}.log" +if [[ -f ${LOG} ]]; then + CMD="rm ${LOG}" + echo ${CMD} + [[ -z "${DRY_RUN:-}" ]] && ${CMD} +fi + # Identify the center that we are running at CENTER= # Customize the build based on the center @@ -384,13 +403,6 @@ else AT_LBANN_LABEL="" fi -LOG="spack-build-${LBANN_ENV}.log" -if [[ -f ${LOG} ]]; then - CMD="rm ${LOG}" - echo ${CMD} - [[ -z "${DRY_RUN:-}" ]] && ${CMD} -fi - LBANN_BUILD_LABEL="lbann_${CLUSTER}_${LBANN_LABEL}" LBANN_BUILD_PARENT_DIR="${LBANN_HOME}/builds/${LBANN_BUILD_LABEL}" LBANN_BUILD_DIR="${LBANN_BUILD_PARENT_DIR}/build" @@ -495,6 +507,22 @@ if [[ "${LBANN_VARIANTS}" =~ (.*)(%[0-9a-zA-Z:\.@]+)(.*) ]]; then LBANN_VARIANTS="${BASH_REMATCH[1]} ${BASH_REMATCH[3]}" fi +# Here is a 
fairly brittle way to find the DiHydrogen, Hydrogen, and Aluminum superbuilds +LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="dha" +if [[ "${LBANN_VARIANTS}" =~ .*"+distconv".* ]]; then + # If the user didn't supply a specific version of Hydrogen on the command line add one + LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}_with_distconv" +fi +if [[ "${LBANN_VARIANTS}" =~ .*"+half".* ]]; then + # If the user didn't supply a specific version of Hydrogen on the command line add one + LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}_with_half" +fi +if [[ "${LBANN_VARIANTS}" =~ .*"+nvshmem".* ]]; then + # If the user didn't supply a specific version of Hydrogen on the command line add one + LBANN_SUPERBUILD_EXTERNAL_DHA_DIR="${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}_with_nvshmem" +fi + + if [[ "${CENTER_COMPILER}" =~ .*"%clang".* ]]; then # If the compiler is clang use the LLD fast linker CENTER_LINKER_FLAGS="+lld" @@ -777,7 +805,7 @@ if [[ -z "${CONFIG_FILE_NAME}" ]]; then # See if there are any center-specific externals SPACK_ENV_YAML_FILE="${SPACK_ROOT}/var/spack/environments/${LBANN_ENV}/spack.yaml" - CMD="set_center_specific_externals ${CENTER} ${SPACK_ARCH_TARGET} ${SPACK_ARCH} ${SPACK_ENV_YAML_FILE} ${LBANN_MODFILES_DIR}" + CMD="set_center_specific_externals ${CENTER} ${SPACK_ARCH_TARGET} ${SPACK_ARCH} ${SPACK_ENV_YAML_FILE} ${LBANN_MODFILES_DIR} ${LBANN_SUPERBUILD_EXTERNAL_DIR} ${LBANN_SUPERBUILD_EXTERNAL_DHA_DIR}" echo ${CMD} | tee -a ${LOG} [[ -z "${DRY_RUN:-}" ]] && { ${CMD} || exit_on_failure "${CMD}"; } @@ -1066,7 +1094,10 @@ export PATH=\${PATH}:\${LBANN_CMAKE_DIR}:\${LBANN_NINJA_DIR}:\${LBANN_PYTHON_DIR export PYTHONPATH=\${LBANN_PYTHONPATH}:\${PYTHONPATH} EOF +BUILD_MODULES= if [[ -n "${MODULE_CMD}" ]]; then + BUILD_MODULES=${MODULE_CMD//module load /} + BUILD_MODULES=${BUILD_MODULES// /;} cat >> ${LBANN_SETUP_FILE}<> ${yaml} @@ -319,8 +336,8 @@ cat <> ${yaml} modules: - mvapich2/2.3.7 EOF - set_superbuild_externals "pascal" 
"cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" - set_superbuild_DHA_externals "pascal" "cuda-11.8.0" "openmpi-4.1.2" "$yaml" "${LOG}" + set_superbuild_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "cuda-11.8.0" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; "power9le" | "power8le") cat <> ${yaml} @@ -333,9 +350,9 @@ cat <> ${yaml} - spec: rdma-core@20 arch=${spack_arch} prefix: /usr EOF - set_superbuild_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" - set_superbuild_DHA_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" - set_superbuild_power_externals "lassen" "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${LOG}" + # set_superbuild_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" + # set_superbuild_DHA_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" "${dha_dir}" + # set_superbuild_power_externals ${host} "cuda-11.8.0" "spectrum-mpi-rolling-release" "$yaml" "${prefix}" ;; "zen" | "zen2") @@ -344,25 +361,25 @@ cat <> ${yaml} hipcub: buildable: false version: - - '5.7.0' + - '${CI_ROCM_VER}' externals: - - spec: hipcub@5.7.0 arch=${spack_arch} - prefix: /opt/rocm-5.7.0/hipcub + - spec: hipcub@${CI_ROCM_VER} arch=${spack_arch} + prefix: /opt/rocm-${CI_ROCM_VER}/hipcub extra_attributes: compilers: - c: /opt/rocm-5.7.0/llvm/bin/clang - c++: /opt/rocm-5.7.0/llvm/bin/clang++ + c: /opt/rocm-${CI_ROCM_VER}/llvm/bin/clang + c++: /opt/rocm-${CI_ROCM_VER}/llvm/bin/clang++ llvm-amdgpu: buildable: false version: - - '5.7.0' + - '${CI_ROCM_VER}' externals: - - spec: llvm-amdgpu@5.7.0 arch=${spack_arch} - prefix: /opt/rocm-5.7.0/llvm + - spec: llvm-amdgpu@${CI_ROCM_VER} arch=${spack_arch} + prefix: /opt/rocm-${CI_ROCM_VER}/llvm extra_attributes: compilers: - c: /opt/rocm-5.7.0/llvm/bin/clang - c++: /opt/rocm-5.7.0/llvm/bin/clang++ + c: 
/opt/rocm-${CI_ROCM_VER}/llvm/bin/clang + c++: /opt/rocm-${CI_ROCM_VER}/llvm/bin/clang++ openmpi: buildable: false version: @@ -373,11 +390,17 @@ cat <> ${yaml} - openmpi/4.1.2 EOF - set_superbuild_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" - set_superbuild_DHA_externals "corona" "rocm-5.7.0" "openmpi-4.1.2" "$yaml" "${LOG}" + set_superbuild_externals ${host} "rocm-${CI_ROCM_VER}" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-${CI_ROCM_VER}" "clang-14.0.6-magic" "openmpi-4.1.2" "$yaml" "${prefix}" "${dha_dir}" ;; - "zen3") + "zen3" | "zen4") + if [[ ${host} == "rzvernal" || ${host} =~ "rzadams" ]]; then + if [[ -z ${prefix} ]]; then + # Override the prefix path for this system + prefix="/usr/workspace/lbann/stable_dependencies" + fi + fi cat <> ${yaml} compilers: - compiler: @@ -393,15 +416,15 @@ cat <> ${yaml} operating_system: rhel8 target: any modules: - - PrgEnv-cray/8.4.0 - - cce/17.0.0 + - PrgEnv-amd + - amd/5.7.1 - rocm/5.7.1 environment: {} - extra_rpaths: - - /opt/cray/pe/cce/17.0.0/cce/x86_64/lib - - /opt/cray/pe/cce/17.0.0/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu + # extra_rpaths: + # - /opt/cray/pe/cce/17.0.1/cce/x86_64/lib + # - /opt/cray/pe/cce/17.0.1/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu - compiler: - spec: cce@17.0.0 + spec: cce@17.0.1 paths: cc: craycc cxx: crayCC @@ -412,12 +435,12 @@ cat <> ${yaml} target: any modules: - PrgEnv-cray - - cce/17.0.0 + - cce/17.0.1 - rocm/5.7.1 environment: {} extra_rpaths: - - /opt/cray/pe/cce/17.0.0/cce/x86_64/lib - - /opt/cray/pe/cce/17.0.0/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu + - /opt/cray/pe/cce/17.0.1/cce/x86_64/lib + - /opt/cray/pe/cce/17.0.1/cce-clang/x86_64/lib/x86_64-unknown-linux-gnu packages: all: require: @@ -448,23 +471,29 @@ cat <> ${yaml} cray-libsci: buildable: false version: - - '23.09.1.1' + - '24.03.0' externals: - - spec: cray-libsci@23.09.1.1 %rocmcc arch=${spack_arch} + - spec: 
cray-libsci@24.03.0 %rocmcc arch=${spack_arch} modules: - - cce/17.0.0 PrgEnv-cray cray-libsci/23.09.1.1 +# - amd/5.7.1 PrgEnv-amd cray-libsci/24.03.0 +# - amd/5.7.1 PrgEnv-amd cray-libsci/23.09.1.1 + - cce/17.0.1 PrgEnv-cray cray-libsci/24.03.0 cray-mpich: buildable: false version: - - '8.1.28' + - '8.1.29' externals: - - spec: cray-mpich@8.1.28 %rocmcc arch=${spack_arch} + - spec: cray-mpich@8.1.29 %rocmcc arch=${spack_arch} modules: - - cce/17.0.0 PrgEnv-cray cray-mpich/8.1.28 + - amd/5.7.1 PrgEnv-amd cray-mpich/8.1.29 +# - cce/17.0.1 PrgEnv-cray cray-mpich/8.1.29 EOF - set_superbuild_externals "tioga" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" - set_superbuild_DHA_externals "tioga" "rocm-5.7.1" "cray-mpich-8.1.28" "$yaml" "${LOG}" - + PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') + echo "BVE Using the Cray programming environment ${PE_ENV_lc}" + set_superbuild_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}" "cray-mpich-8.1.29" "$yaml" "${prefix}" + set_superbuild_DHA_externals ${host} "rocm-5.7.1" "${PE_ENV_lc}" "cray-mpich-8.1.29" "$yaml" "${prefix}" "${dha_dir}" + # set_superbuild_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${LOG}" "${prefix}" "mi300a" + # set_superbuild_DHA_externals ${host} "rocm-6.0.3" "cray-mpich-8.1.28" "$yaml" "${prefix}" "mi300a" ;; *) echo "No center-specified externals." @@ -708,7 +737,7 @@ set_center_specific_variants() "ivybridge") # Catalyst CENTER_USER_VARIANTS="+onednn" ;; - "zen" | "zen2") # Corona + "zen" | "zen2" | "zen3" | "zen4") # Corona CENTER_USER_VARIANTS="+rocm" ;; *) diff --git a/scripts/superbuild/CMakeLists.txt b/scripts/superbuild/CMakeLists.txt index 463d386c447..549add11d2d 100644 --- a/scripts/superbuild/CMakeLists.txt +++ b/scripts/superbuild/CMakeLists.txt @@ -88,6 +88,7 @@ lbann_sb_add_packages( Clara CNPY Conduit + hiptt Hydrogen # DiHydrogen depends on H. DiHydrogen OpenCV @@ -115,7 +116,7 @@ foreach (pkg ${LBANN_SB_BUILD_PKGS}) endforeach () # Print a helpful(?) 
message -set(LBANN_SB_SUGG_CMAKE_PREFIX_PATH_TMP "\$\{CMAKE_PREFIX_PATH\}") +set(LBANN_SB_SUGG_CMAKE_PREFIX_PATH_TMP "${CMAKE_PREFIX_PATH}" "\$\{CMAKE_PREFIX_PATH\}") message("\n-----------------------------------------------------------------\n") message("LBANN SuperBuild will build the following packages:\n") foreach (pkg ${LBANN_SB_BUILD_PKGS}) @@ -142,6 +143,8 @@ message("Note that these assume a Bourne-compatible shell.") message("\n-----------------------------------------------------------------\n") file(WRITE "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" "export CMAKE_PREFIX_PATH=${LBANN_SB_SUGG_CMAKE_PREFIX_PATH}\n") +file(WRITE "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export CMAKE_PREFIX_PATH=${LBANN_SB_SUGG_CMAKE_PREFIX_PATH}\n") if (LBANN_SB_BUILD_AWS_OFI_RCCL) message("-----------------------------------------------------------------\n") @@ -150,6 +153,26 @@ if (LBANN_SB_BUILD_AWS_OFI_RCCL) message("have an effect. It may be useful to do the following:\n") message("export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBARY_PATH\}\n") message("-----------------------------------------------------------------\n") + file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" + "export AWS_OFI_RCCL_LIBDIR=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") + file(APPEND "${CMAKE_BINARY_DIR}/lbann_sb_suggested_cmake_prefix_path.sh" + "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") + file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export AWS_OFI_RCCL_LIBDIR=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib\n") + file(APPEND "${CMAKE_INSTALL_PREFIX}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + "export LD_LIBRARY_PATH=${LBANN_SB_AWS_OFI_RCCL_PREFIX}/lib:\$\{LD_LIBRARY_PATH\}\n") +endif () + +if (LBANN_SB_FWD_LBANN_LBANN_WITH_PYTHON_FRONTEND) + message("-----------------------------------------------------------------\n") + 
message("LBANN was built with support for the Python Front End (PFE) (If you need to install it via pip you can in the LBANN site-packages with):") + message(" python3 -m pip install --target \$\{LBANN_PYTHON_SITE_PACKAGES\} pytest") + message(" python3 -m pip install --target \$\{LBANN_PYTHON_SITE_PACKAGES\} protobuf") + if (LBANN_SB_FWD_LBANN_LBANN_WITH_CNPY) + message("\nLBANN was built with support for the NumPy (If you need to install it via pip you can in the LBANN site-packages with):") + message(" python3 -m pip install --target \$\{LBANN_PYTHON_SITE_PACKAGES\} numpy") + endif () + message("\n-----------------------------------------------------------------\n") endif () # Add a custom target for bundling all things up diff --git a/scripts/superbuild/ci/ci_core_dependencies.sh b/scripts/superbuild/ci/ci_core_dependencies.sh new file mode 100755 index 00000000000..fe17d2d9ab3 --- /dev/null +++ b/scripts/superbuild/ci/ci_core_dependencies.sh @@ -0,0 +1,182 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. +## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. 
See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ + +# Example usage: COMPILER_FAMILY= ./scripts/superbuild/ci/ci_core_dependencies.sh +# Initialize modules for users not using bash as a default shell +modules_home=${MODULESHOME:-"/usr/share/lmod/lmod"} +if [[ -e ${modules_home}/init/bash ]] +then + source ${modules_home}/init/bash +fi + +set -o errexit +set -o nounset + +hostname="$(hostname)" +cluster=${hostname//[0-9]/} +project_dir="$(git rev-parse --show-toplevel)" +if [[ $? -eq 1 ]] +then + project_dir="$(pwd)" +fi + +# Finish setting up the environment +source ${project_dir}/.gitlab/setup_env.sh + +# Set to ON (or any CMake truthy value) to build all of the +# dependencies of the LBANN stack +BUILD_EXTERNAL_TPLS=ON + +case "${cluster}" in + tioga) + # Set to ON if you're on a Cray machine that doesn't provide the AWS + # plugin as part of its default RCCL installation. + # + # It might also be advisable to build this if you build a custom RCCL. + # The configuration script takes a RCCL path as a parameter, so it + # could matter, but it's not clear how much. 
+ aws_ofi_plugin="-D LBANN_SB_BUILD_AWS_OFI_RCCL=ON" + BUILD_ROCM_TPLS="ON" + ;; + corona) + BUILD_ROCM_TPLS="ON" + ;; + lassen) + power9_flags="-D LBANN_SB_OpenCV_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ + -D LBANN_SB_OpenCV_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ + -D LBANN_SB_FWD_OpenCV_WITH_OPENJPEG=OFF \ + -D LBANN_SB_FWD_OpenCV_WITH_IPP=OFF \ + \ + -D LBANN_SB_BUILD_OpenBLAS=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_OpenBLAS_C_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gcc \ + -D LBANN_SB_OpenBLAS_CXX_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/g++ \ + -D LBANN_SB_OpenBLAS_Fortran_COMPILER=/usr/tce/packages/gcc/gcc-11.2.1/bin/gfortran" + ;; + *) + ;; +esac + +# Set to the directory with the top-level CMakeLists.txt file for LBANN +LBANN_SRC_DIR=$(git rev-parse --show-toplevel) + +# Set to the directory with the top-level SuperBuild CMakeLists.txt file +SUPERBUILD_SRC_DIR=${LBANN_SRC_DIR}/scripts/superbuild + +# Setup the common environment +#source ${SUPERBUILD_SRC_DIR}/ci/ci_tioga_env.sh + +# Use an accessible build directory so that the source files are preserved for debuggin +BUILD_ROOT=${CI_STABLE_DEPENDENCIES_ROOT}/.build/${cluster}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} + +# Set to the preferred install directory +INSTALL_PREFIX=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} + +if [ ! 
-e ${INSTALL_PREFIX} ]; then + mkdir -p ${INSTALL_PREFIX} +fi + +# Set to the preferred build directory +BUILD_DIR=${BUILD_ROOT}/lbann-superbuild-core-dependencies + +# The compilers are set via CC, CXX environment variables +cmake \ + -G Ninja \ + -S ${SUPERBUILD_SRC_DIR} \ + -B ${BUILD_DIR} \ + \ + -D CMAKE_PREFIX_PATH=${CMAKE_CMAKE_PREFIX_PATH} \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -D CMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ + -D CMAKE_BUILD_RPATH="${extra_rpaths//:/|}" \ + -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/|}" \ + \ + -D BUILD_SHARED_LIBS=ON \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + -D CMAKE_EXE_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + -D CMAKE_SHARED_LINKER_FLAGS="${EXTRA_LINK_FLAGS}" \ + \ + -D CMAKE_CXX_STANDARD=17 \ + -D CMAKE_CUDA_STANDARD=17 \ + -D CMAKE_HIP_STANDARD=17 \ + \ + -D CMAKE_CUDA_ARCHITECTURES=${gpu_arch} \ + -D CMAKE_HIP_ARCHITECTURES=${gpu_arch} \ + \ + -D CMAKE_POSITION_INDEPENDENT_CODE=ON \ + \ + -D LBANN_SB_DEFAULT_INSTALL_PATH_STRATEGY="PKG_LC" \ + -D LBANN_SB_DEFAULT_CUDA_OPTS=${cuda_platform} \ + -D LBANN_SB_DEFAULT_ROCM_OPTS=${rocm_platform} \ + \ + -D LBANN_SB_BUILD_adiak=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Caliper=ON \ + -D LBANN_SB_adiak_BUILD_SHARED_LIBS=ON \ + -D LBANN_SB_Caliper_BUILD_SHARED_LIBS=ON \ + \ + -D LBANN_SB_BUILD_Catch2=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_cereal=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_Clara=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_CNPY=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hiptt=${BUILD_ROCM_TPLS:-"OFF"} \ + -D LBANN_SB_BUILD_protobuf=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_spdlog=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_zstr=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_hwloc=${BUILD_EXTERNAL_TPLS} \ + \ + -D LBANN_SB_BUILD_Conduit=${BUILD_EXTERNAL_TPLS} \ + -D LBANN_SB_BUILD_HDF5=${BUILD_EXTERNAL_TPLS} \ + \ + ${aws_ofi_plugin:-""} \ + \ + -D LBANN_SB_BUILD_JPEG-TURBO=${BUILD_EXTERNAL_TPLS} \ + -D 
LBANN_SB_BUILD_OpenCV=${BUILD_EXTERNAL_TPLS} \ + ${power9_flags:=""} \ + -D LBANN_SB_OpenCV_TAG=4.x + +# Save a list of the currently loaded modules +if [ ! -e ${INSTALL_PREFIX}/logs ]; then + mkdir -p ${INSTALL_PREFIX}/logs +fi +module -t list 2> ${INSTALL_PREFIX}/logs/modules.txt + +pushd ${BUILD_DIR} +ninja +popd + +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +echo "~~~~~ Installing Python Packages with PIP" +echo "~~~~~ $(date)" +echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + +for p in pytest protobuf tqdm numpy scipy; +do + if ! pip3 show ${p} 1>/dev/null; then + CMD="python3 -m pip install -i https://pypi.org/simple ${p}" + echo ${CMD} + ${CMD} + fi +done diff --git a/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake b/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake index 5794aacada0..2af0a86f87a 100644 --- a/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake +++ b/scripts/superbuild/cmake/modules/LBANNSuperBuildAddCMakeExternPkg.cmake @@ -228,6 +228,7 @@ macro(lbann_sb_add_cmake_extern_pkg) set(LBANN_SB_DEPENDS_TAG "DEPENDS") string(REPLACE ";" "|" LBANN_SB_FWD_${PKG_NAME}_CMAKE_PREFIX_PATH + "${LBANN_SB_FWD_${PKG_NAME}_CMAKE_PREFIX_PATH};" "${LBANN_SB_${PKG_NAME}_DEPENDS_PATHS}") message(STATUS "${PKG_NAME} depends on: ${LBANN_SB_${PKG_NAME}_DEPENDS}") endif () diff --git a/scripts/superbuild/hiptt/CMakeLists.txt b/scripts/superbuild/hiptt/CMakeLists.txt new file mode 100644 index 00000000000..3309f8d2398 --- /dev/null +++ b/scripts/superbuild/hiptt/CMakeLists.txt @@ -0,0 +1,78 @@ +################################################################################ +## Copyright (c) 2014-2024, Lawrence Livermore National Security, LLC. +## Produced at the Lawrence Livermore National Laboratory. +## Written by the LBANN Research Team (B. Van Essen, et al.) listed in +## the CONTRIBUTORS file. +## +## LLNL-CODE-697807. +## All rights reserved. 
+## +## This file is part of LBANN: Livermore Big Artificial Neural Network +## Toolkit. For details, see http://software.llnl.gov/LBANN or +## https://github.com/LLNL/LBANN. +## +## Licensed under the Apache License, Version 2.0 (the "Licensee"); you +## may not use this file except in compliance with the License. You may +## obtain a copy of the License at: +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +## implied. See the License for the specific language governing +## permissions and limitations under the license. +################################################################################ +lbann_sb_init_extern_pkg( + NAME hiptt + LANGUAGES C CXX HIP + GITHUB_URL tbennun/hipTT.git + GIT_TAG "master") + +# The build system here is just a set of makefiles. +find_program(GNU_MAKE_PROGRAM make) + +include (ExternalProject) +ExternalProject_Add(${PKG_NAME} + PREFIX "${CMAKE_CURRENT_BINARY_DIR}" + ${LBANN_SB_GIT_REPOSITORY_TAG} ${LBANN_SB_${PKG_NAME}_URL} + ${LBANN_SB_GIT_TAG_TAG} ${LBANN_SB_${PKG_NAME}_TAG} + TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp" + STAMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/stamp" + + SOURCE_DIR "${LBANN_SB_${PKG_NAME}_SOURCE_DIR}" + INSTALL_DIR "${LBANN_SB_${PKG_NAME}_PREFIX}" + + GIT_SHALLOW 1 + + BUILD_IN_SOURCE 1 + USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 + + CONFIGURE_COMMAND "" + + BUILD_COMMAND + ${GNU_MAKE_PROGRAM} + "PREFIX=${LBANN_SB_${PKG_NAME}_PREFIX}" + "CC=${LBANN_SB_${PKG_NAME}_C_COMPILER}" + "CXX=${LBANN_SB_${PKG_NAME}_CXX_COMPILER}" + -j${${PKG_NAME}_MAX_MAKE_JOBS} + + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory + ${LBANN_SB_${PKG_NAME}_SOURCE_DIR}/include + ${LBANN_SB_${PKG_NAME}_PREFIX}/include + + COMMAND + ${CMAKE_COMMAND} -E copy_directory + 
${LBANN_SB_${PKG_NAME}_SOURCE_DIR}/lib + ${LBANN_SB_${PKG_NAME}_PREFIX}/lib + +) + +set(${PKG_NAME}_DIR ${LBANN_SB_${PKG_NAME}_PREFIX} + CACHE INTERNAL "The install prefix of ${PKG_NAME}.") diff --git a/scripts/superbuild_externals.sh b/scripts/superbuild_externals.sh index dd48c635c9b..aeb9c9cc6a3 100644 --- a/scripts/superbuild_externals.sh +++ b/scripts/superbuild_externals.sh @@ -2,11 +2,18 @@ set_superbuild_externals() { local system="$1" local dnn_lib="$2" - local mpi="$3" - local yaml="$4" - local LOG="$5" + local compiler_ver="$3" + local mpi="$4" + local yaml="$5" + local prefix="$6" + local gpu_arch="$7" - CMD="source /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + if [ -n "${gpu_arch}" ]; then + dnn_lib="${dnn_lib}/${gpu_arch}" + fi + + local sb_extra_prefix="${system}/${dnn_lib}/${compiler_ver}/${mpi}" + CMD="source ${prefix}/${sb_extra_prefix}/logs/lbann_sb_suggested_cmake_prefix_path.sh" echo ${CMD} | tee -a ${LOG} ${CMD} @@ -17,135 +24,178 @@ cat <> ${yaml} - 'master' externals: - spec: adiak@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/adiak + prefix: ${prefix}/${sb_extra_prefix}/adiak caliper: buildable: false version: - 'master' externals: - spec: caliper@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/caliper + prefix: ${prefix}/${sb_extra_prefix}/caliper catch2: buildable: false version: - '2.9.2' externals: - spec: catch2@2.9.2 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/catch2 + prefix: ${prefix}/${sb_extra_prefix}/catch2 + half: + buildable: false + version: + - '2.1.0' + externals: + - spec: half@2.1.0 arch=${spack_arch} + prefix: ${prefix}/half-2.1.0 hdf5: buildable: false version: - '1.10.9' externals: - spec: hdf5@1.10.9 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/hdf5 + prefix: 
${prefix}/${sb_extra_prefix}/hdf5 jpeg-turbo: buildable: false version: - '2.0.3' externals: - spec: jpeg-turbo@2.0.3 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/jpeg-turbo + prefix: ${prefix}/${sb_extra_prefix}/jpeg-turbo spdlog: buildable: false version: - '1.12.0' externals: - spec: spdlog@1.12.0 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/spdlog + prefix: ${prefix}/${sb_extra_prefix}/spdlog cereal: buildable: false version: - '1.3.0' externals: - spec: cereal@1.3.0 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/cereal + prefix: ${prefix}/${sb_extra_prefix}/cereal clara: buildable: false version: - '1.1.5' externals: - spec: clara@1.1.5 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/clara + prefix: ${prefix}/${sb_extra_prefix}/clara cnpy: buildable: false version: - 'master' externals: - spec: cnpy@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/cnpy + prefix: ${prefix}/${sb_extra_prefix}/cnpy conduit: buildable: false version: - 'develop' externals: - spec: conduit@develop arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/conduit - hiptt: - buildable: false - version: - - 'master' - externals: - - spec: hiptt@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/hiptt + prefix: ${prefix}/${sb_extra_prefix}/conduit opencv: buildable: false version: - '4.1.0' externals: - spec: opencv@4.1.0 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/opencv + prefix: ${prefix}/${sb_extra_prefix}/opencv protobuf: buildable: false version: - '3.21.5' externals: - - spec: protobuf@3.21.5 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/protobuf + - spec: 
protobuf@3.21.5+shared arch=${spack_arch} +# - spec: protobuf@3.21.5~shared arch=${spack_arch} + prefix: ${prefix}/${sb_extra_prefix}/protobuf zstr: buildable: false version: - 'master' externals: - spec: zstr@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/zstr + prefix: ${prefix}/${sb_extra_prefix}/zstr +EOF + + if [[ ${dnn_lib} =~ "rocm" ]]; then +cat <> ${yaml} + hwloc: + buildable: false + version: + - '3.0.0' + externals: + - spec: hwloc@3.0.0 arch=${spack_arch} + prefix: ${prefix}/${sb_extra_prefix}/hwloc + aws-ofi-rccl: + buildable: false + version: + - 'cxi' + externals: + - spec: aws-ofi-rccl@cxi arch=${spack_arch} + prefix: ${prefix}/${sb_extra_prefix}/aws_ofi_rccl + hiptt: + buildable: false + version: + - 'master' + externals: + - spec: hiptt@master arch=${spack_arch} + prefix: ${prefix}/${sb_extra_prefix}/hiptt +EOF + fi + + if [[ ${dnn_lib} =~ "cuda" ]]; then +cat <> ${yaml} nccl: buildable: false version: - '2.19.4' externals: - spec: nccl@2.19.4 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/nccl + prefix: ${prefix}/${sb_extra_prefix}/nccl cudnn: buildable: false version: - '8.9.4' externals: - spec: cudnn@8.9.4 arch=linux-rhel8-broadwell - prefix: /p/vast1/lbann/stable_dependencies/cudnn-8.9.4/cuda_11_x86_64 + prefix: ${prefix}/cudnn-8.9.4/cuda_11_x86_64 - spec: cudnn@8.9.4 arch=linux-rhel7-power9le - prefix: /p/vast1/lbann/stable_dependencies/cudnn-8.9.4/cuda_11_ppc64le + prefix: ${prefix}/cudnn-8.9.4/cuda_11_ppc64le cutensor: buildable: false version: - '1.7.0.1' externals: - spec: cutensor@1.7.0.1 arch=linux-rhel8-broadwell - prefix: /p/vast1/lbann/stable_dependencies/cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive + prefix: ${prefix}/cutensor-1.7.0.1/libcutensor-linux-x86_64-1.7.0.1-archive - spec: cutensor@1.7.0.1 arch=linux-rhel7-power9le - prefix: 
/p/vast1/lbann/stable_dependencies/cutensor-1.7.0.1/libcutensor-linux-ppc64le-1.7.0.1-archive + prefix: ${prefix}/cutensor-1.7.0.1/libcutensor-linux-ppc64le-1.7.0.1-archive EOF + fi } set_superbuild_DHA_externals() { local system="$1" local dnn_lib="$2" - local mpi="$3" - local yaml="$4" + local compiler_ver="$3" + local mpi="$4" + local yaml="$5" + local prefix="$6" + local dha_dir="$7" + local gpu_arch="$8" - source /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh + if [ -n "${gpu_arch}" ]; then + dnn_lib="${dnn_lib}/${gpu_arch}" + fi + + local sb_extra_prefix="${system}/${dnn_lib}/${compiler_ver}/${mpi}" + CMD="source ${prefix}/${sb_extra_prefix}/${dha_dir}/logs/lbann_sb_suggested_cmake_prefix_path.sh" + echo ${CMD} | tee -a ${LOG} + ${CMD} cat <> ${yaml} aluminum: @@ -154,21 +204,21 @@ cat <> ${yaml} - 'master' externals: - spec: aluminum@master arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/aluminum + prefix: ${prefix}/${sb_extra_prefix}/${dha_dir}/aluminum hydrogen: buildable: false version: - 'develop' externals: - spec: hydrogen@develop arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/hydrogen + prefix: ${prefix}/${sb_extra_prefix}/${dha_dir}/hydrogen dihydrogen: buildable: false version: - 'develop' externals: - spec: dihydrogen@develop arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/dihydrogen + prefix: ${prefix}/${sb_extra_prefix}/${dha_dir}/dihydrogen EOF } @@ -176,10 +226,17 @@ set_superbuild_power_externals() { local system="$1" local dnn_lib="$2" - local mpi="$3" - local yaml="$4" + local compiler_ver="$3" + local mpi="$4" + local yaml="$5" + local prefix="$6" + local gpu_arch="$7" + + if [ -n "${gpu_arch}" ]; then + dnn_lib="${dnn_lib}/${gpu_arch}" + fi - source 
/p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/logs/lbann_sb_suggested_cmake_prefix_path.sh + local sb_extra_prefix="${system}/${dnn_lib}/${compiler_ver}/${mpi}" cat <> ${yaml} openblas: @@ -188,6 +245,6 @@ cat <> ${yaml} - '0.3.6' externals: - spec: openblas@0.3.6 arch=${spack_arch} - prefix: /p/vast1/lbann/stable_dependencies/${system}/${dnn_lib}/${mpi}/openblas + prefix: ${prefix}/${sb_extra_prefix}/openblas EOF } diff --git a/scripts/utilities.sh b/scripts/utilities.sh index 97445aeb541..e3bf1a9561f 100644 --- a/scripts/utilities.sh +++ b/scripts/utilities.sh @@ -17,6 +17,13 @@ osx_realpath() { [[ $1 = /* ]] && echo "$1" || echo "$PWD/${1#./}" } +function host_basename() { + HOST=$(hostname) + HOST=${HOST//[[:digit:]]/} + HOST=${HOST//\-/} + echo ${HOST} +} + function normpath() { # Remove all /./ sequences. local path=${1//\/.\//\/} diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index 1e9e0f2cb84..82923533948 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -246,7 +246,7 @@ void check_gradients::do_check_gradients(model& m) const auto& c = static_cast(m.get_execution_context()); auto& comm = *m.get_comm(); const auto mode = c.get_execution_mode(); - const auto& layers = m.get_layers(); + // const auto& layers = m.get_layers(); // Return immediately if gradient check isn't currently needed if (!m_modes.empty() && m_modes.count(mode) == 0) { diff --git a/src/callbacks/check_metric.cpp b/src/callbacks/check_metric.cpp index 611e2b16bfc..b3e8f8f79cd 100644 --- a/src/callbacks/check_metric.cpp +++ b/src/callbacks/check_metric.cpp @@ -37,6 +37,7 @@ #include "lbann/proto/callbacks.pb.h" +#include #include #include #include @@ -59,11 +60,19 @@ check_metric::check_metric(std::string metric_name, if (lower_bound > upper_bound) { std::stringstream err; err << "callback \"" << name() << "\" " - << "got an invalid range for metric values " + << "got an invalid range for metric 
values " << std::setprecision(9) << "(lower bound " << m_lower_bound << ", " << "upper bound " << m_upper_bound << ")"; LBANN_ERROR(err.str()); } + if (lower_bound == upper_bound) { + std::stringstream err; + err << "callback \"" << name() << "\" " + << "got an zero range for metric values " << std::setprecision(9) + << "(lower bound " << m_lower_bound << " == " + << "upper bound " << m_upper_bound << ")"; + LBANN_WARNING(err.str()); + } } check_metric::check_metric() : check_metric("", {}, 0, 0, false) {} @@ -122,8 +131,8 @@ void check_metric::do_check_metric(const model& m) const if (!(m_lower_bound <= value && value <= m_upper_bound)) { err << "callback \"" << name() << "\" expected " << "metric \"" << m_metric_name << "\" " - << "to have a value in range " - << "[" << m_lower_bound << "," << m_upper_bound << "], " + << "to have a value in range " << std::setprecision(9) << "[" + << m_lower_bound << "," << m_upper_bound << "], " << "but found a value of " << value; if (m_error_on_failure) { LBANN_ERROR(err.str()); diff --git a/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp b/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp index 6d18a431f71..a80dba25d00 100644 --- a/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp +++ b/src/execution_algorithms/ltfb/truncation_selection_exchange.cpp @@ -150,6 +150,14 @@ void TruncationSelectionExchange::select_next(model& m, data_coordinator& dc) const { auto const& comm = *(m.get_comm()); + + El::mpi::EnsureComm( + comm.get_world_comm(), + El::SyncInfo{}); + El::mpi::EnsureComm( + comm.get_world_comm(), + El::SyncInfo{}); + const unsigned int num_trainers = comm.get_num_trainers(); const unsigned int trainer_id = comm.get_trainer_rank(); auto const step = ctxt.get_step();