From 271df555df62ce0e4ee56c84c87e7eb4b537de17 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Mon, 2 Oct 2023 18:50:44 +0000
Subject: [PATCH 01/19] Add special exit 5 to cause localization to be retried

---
 canine/localization/base.py | 9 +++++----
 canine/orchestrator.py | 7 ++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 40c88a3..a5218d7 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -812,7 +812,7 @@ def create_persistent_disk(self,
  ## attach as read-write, using same device-name as disk-name
  'if [[ ! -e /dev/disk/by-id/google-${GCP_DISK_NAME} ]]; then',
- 'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || true',
+ 'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || { [ $? == 5 ] && exit 5 || true; }',
  'fi',

  ## wait for disk to attach, with exponential backoff up to 2 minutes
@@ -1285,8 +1285,9 @@ def sensitive_ext_extract(basename):
  "if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
- # force a 0 exit
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || true",
+ # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
+ # which we explicitly propagate to cause localization to be retried
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1312,7 +1313,7 @@ def sensitive_ext_extract(basename):
  "done",

  # mount within Slurm worker container
- "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
+ "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
 # # mount on host (so that task dockers can access it)
 # "if [[ -f /.dockerenv ]]; then",
 # "sudo nsenter -t 1 -m mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 6263c78..6c9999d 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -96,7 +96,12 @@
   fi
 done
 echo -n $CANINE_JOB_RC > $CANINE_JOB_ROOT/.job_exit_code
-elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # this is a special exit code that localization.sh can explicitly return
+# these are special exit codes that localization.sh can explicitly return
+elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
+  echo "INFO: localization will be retried" >&2
+  scontrol requeue $SLURM_JOB_ID
+  scontrol update $SLURM_JOB_ID starttime=now+10m
+elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization can be skipped (e.g. localization disk already exists)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
   echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code
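
A note on the retry idiom this patch introduces: the localization script normally swallows attach-disk failures (another task may have raced it to the same disk), but gcloud's exit code 5, which the comments above associate with exceeded quota, is now propagated so the job wrapper can requeue the task. A minimal bash sketch of that control flow, using a hypothetical attach_disk stand-in rather than the real gcloud call:

    #!/bin/bash
    # stand-in for "gcloud compute instances attach-disk"; FAKE_RC simulates its
    # exit status (0 = attached, 1 = benign race with another task, 5 = quota error)
    attach_disk() { return "${FAKE_RC:-0}"; }

    # swallow ordinary failures, but propagate exit code 5 so the calling
    # entrypoint knows localization should be requeued rather than failed
    attach_disk || { [ $? == 5 ] && exit 5 || true; }

    echo "continuing localization"

Running this with FAKE_RC=1 prints the final message; with FAKE_RC=5 it exits with status 5, which the orchestrator entrypoint patched above maps to an scontrol requeue.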

From e92261a8f21125acd836796d22f6617203e422ac Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Mon, 2 Oct 2023 18:51:01 +0000
Subject: [PATCH 02/19] Remove unused code

---
 canine/localization/base.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index a5218d7..1b2dbec 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1314,10 +1314,6 @@ def sensitive_ext_extract(basename):
  # mount within Slurm worker container
  "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
-# # mount on host (so that task dockers can access it)
-# "if [[ -f /.dockerenv ]]; then",
-# "sudo nsenter -t 1 -m mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
-# "fi",
  "fi",

  # because we forced zero exits for the previous commands,
@@ -1425,10 +1425,6 @@ def sensitive_ext_extract(basename):
  ' CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}',
  ' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
  ' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
-# # unmount on container host too
-# ' if [[ -f /.dockerenv ]]; then',
-# ' sudo nsenter -t 1 -m umount ${CANINE_RODISK_DIR} || true',
-# ' fi',
  ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK || echo "Error detaching disk ${CANINE_RODISK}" >&2',
  ' else',
  ' echo "RODISK ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',

From bdfad081b500ae87b1309acc5e18a2ca23808143 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Mon, 2 Oct 2023 19:00:01 +0000
Subject: [PATCH 03/19] Output all RODISK mount messages to stderr

* No more DIAG_FILE
* Clarify some error messages
---
 canine/localization/base.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 1b2dbec..4bc087c 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1278,16 +1278,13 @@ def sensitive_ext_extract(basename):
  "sudo mkdir -p ${CANINE_RODISK_DIR}",
  "fi",

- # create tempfile to hold diagnostic information
- "DIAG_FILE=$(mktemp)",
-
  # attach the disk if it's not already
  "if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
  # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1306,22 +1303,22 @@ def sensitive_ext_extract(basename):
  'exit 1',
  'fi',
  # otherwise, it didn't attach for some other reason
- 'echo "Timeout exceeded for disk to attach; perhaps the stderr of \`gcloud compute instances attach disk\` might contain insight:" >&2; cat $DIAG_FILE >&2',
+ 'echo "Timeout exceeded for disk to attach" >&2',
  'exit 1',
  'fi',
  "sleep 10; ((++tries))",
  "done",

  # mount within Slurm worker container
- "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
+ "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # because we forced zero exits for the previous commands,
  # we need to verify that the mount actually exists
- "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; cat $DIAG_FILE >&2; exit 1; }",
+ "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; exit 1; }",

  # also verify that the filesystem is OK
- "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk filesystem is bad!' >&2; cat $DIAG_FILE >&2; exit 1; }",
+ "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 1; }",

  # lock the disk; will be unlocked during teardown script (or if script crashes)
  # this is to ensure that we don't unmount the disk during teardown

From 565b57fb5265da7b5f792517aa6c498cf8177b5c Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 15:57:35 +0000
Subject: [PATCH 04/19] Clarify comments

---
 canine/localization/base.py | 2 +-
 canine/orchestrator.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 4bc087c..408d129 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -841,7 +841,7 @@ def create_persistent_disk(self,
  '[ $TRIES -ge 120 ] && { echo "Exceeded timeout waiting for another node to finish making scratch disk" >&2; exit 1; } || :',
  '((++TRIES))',
  'done',
- 'exit 1 #DEBUG_OMIT', # fail localizer -> requeue task -> job avoid
+ 'exit 1 #DEBUG_OMIT', # fail localizer -> retry task via Prefect -> job avoid. TODO: should this be exit code 5 (fail localizer -> auto-requeue via slurm?)
  'fi',
  'fi',
  # TODO: what if the task exited on the other
diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 6c9999d..f3e595a 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -101,7 +101,7 @@
   echo "INFO: localization will be retried" >&2
   scontrol requeue $SLURM_JOB_ID
   scontrol update $SLURM_JOB_ID starttime=now+10m
-elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization can be skipped (e.g. localization disk already exists)
+elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
   echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code

From 8d16003d52c7fa8203fd7367f754fee8ae77757e Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 15:57:47 +0000
Subject: [PATCH 05/19] This command won't run, since job requeue will quit the script

---
 canine/orchestrator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index f3e595a..7bd514e 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -100,7 +100,6 @@
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
   echo "INFO: localization will be retried" >&2
   scontrol requeue $SLURM_JOB_ID
-  scontrol update $SLURM_JOB_ID starttime=now+10m
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0

From 940aeb292e04a4795ba68fd5a3c370ecaaef784b Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 16:03:50 +0000
Subject: [PATCH 06/19] No need to have an exit 5 here, since that code is reserved for gcloud exceeding quota

---
 canine/localization/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 408d129..921ea9f 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1310,7 +1310,7 @@ def sensitive_ext_extract(basename):
  "done",

  # mount within Slurm worker container
- "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || { [ $? == 5 ] && exit 5 || true; }",
+ "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || true",
  "fi",

  # because we forced zero exits for the previous commands,

From 3029c3780cc4cfaa2ee472e80983b018f685c01c Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 21:46:31 +0000
Subject: [PATCH 07/19] Make status messages more friendly

---
 canine/localization/base.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 921ea9f..101752b 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -802,6 +802,8 @@ def create_persistent_disk(self,
  'GCP_DISK_SIZE={}'.format(disk_size),
  'GCP_TSNT_DISKS_DIR={}'.format(mount_prefix),

+ 'echo "Saving outputs to scratch disk ${GCP_DISK_NAME}" >&2' if is_scratch_disk else 'echo "Localizing inputs to cache disk ${GCP_DISK_NAME} (${GCP_DISK_SIZE}GB)" >&2',
+
  ## create disk
  'if ! gcloud compute disks describe "${GCP_DISK_NAME}" --zone ${CANINE_NODE_ZONE}; then',
  'gcloud compute disks create "${GCP_DISK_NAME}" --size "${GCP_DISK_SIZE}GB" --type pd-standard --zone "${CANINE_NODE_ZONE}" --labels wolf=canine',
@@ -1274,6 +1276,8 @@ def sensitive_ext_extract(basename):
  "CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}",
  "CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}",

+ 'echo "Attaching read-only disk ${CANINE_RODISK} ..." >&2',
+
  "if [[ ! -d ${CANINE_RODISK_DIR} ]]; then",
  "sudo mkdir -p ${CANINE_RODISK_DIR}",
  "fi",
@@ -1284,7 +1288,7 @@ def sensitive_ext_extract(basename):
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
  # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro || { [ $? == 5 ] && exit 5 || true; }",
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &> /dev/null || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1303,7 +1307,7 @@ def sensitive_ext_extract(basename):
  'exit 1',
  'fi',
  # otherwise, it didn't attach for some other reason
- 'echo "Timeout exceeded for disk to attach" >&2',
+ 'echo "Read-only disk did not successfully attach!" >&2',
  'exit 1',
  'fi',
  "sleep 10; ((++tries))",
  "done",
@@ -1318,6 +1322,8 @@ def sensitive_ext_extract(basename):
  # if other processes are still using it
  "flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

+ 'echo "Successfully attached read-only disk ${CANINE_RODISK} ..." >&2',
+
  "done",
 ]
@@ -1421,10 +1427,11 @@ def sensitive_ext_extract(basename):
  ' CANINE_RODISK=${!CANINE_RODISK}',
  ' CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}',
  ' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
+ ' echo "Unmounting read-only disk ${CANINE_RODISK}" >&2',
  ' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
- ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK || echo "Error detaching disk ${CANINE_RODISK}" >&2',
+ ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
  ' else',
- ' echo "RODISK ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
+ ' echo "Read-only disk ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
  ' fi',
  'done)'
 ] + ( disk_teardown_script if self.localize_to_persistent_disk else [] )
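
The teardown block touched above relies on flock semantics worth spelling out: while a job is using a read-only disk, localization keeps a shared lock on the mount directory (the background "flock -os ... sleep infinity" line), and teardown only unmounts if a non-blocking "flock -n" probe succeeds. A small self-contained sketch of that interplay, using a temporary directory instead of a real RODISK mount point:

    #!/bin/bash
    LOCKDIR=$(mktemp -d)    # stands in for ${CANINE_RODISK_DIR}

    # "localizer" of another job: hold a shared lock on the directory for a few seconds
    flock -s "$LOCKDIR" sleep 3 &

    sleep 1
    # "teardown": the non-blocking exclusive probe fails while the shared lock is held
    flock -n "$LOCKDIR" true && echo "lock free; safe to unmount and detach" \
                             || echo "disk busy (another job); skipping unmount"

    wait
    # once the lock holder exits, the same probe succeeds
    flock -n "$LOCKDIR" true && echo "lock free; safe to unmount and detach"

This only illustrates the locking pattern; the real script also checks mountpoint -q and calls gcloud to detach the disk.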

From ac9cef290e350dc6c0248416cc69e35e80484eb3 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 22:36:37 +0000
Subject: [PATCH 08/19] This is a recoverable error

---
 canine/localization/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 101752b..b156dc4 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1322,7 +1322,7 @@ def sensitive_ext_extract(basename):
  "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; exit 1; }",

  # also verify that the filesystem is OK
- "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 1; }",
+ "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 5; }",

  # lock the disk; will be unlocked during teardown script (or if script crashes)
  # this is to ensure that we don't unmount the disk during teardown

From 1710fc4c82510377c13a9025fdc5f1bf9e60ba1e Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Wed, 4 Oct 2023 00:04:22 +0000
Subject: [PATCH 09/19] Ellipsis -> period

---
 canine/localization/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index b156dc4..6c2c443 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1329,7 +1329,7 @@ def sensitive_ext_extract(basename):
  # if other processes are still using it
  "flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

- 'echo "Successfully attached read-only disk ${CANINE_RODISK} ..." >&2',
+ 'echo "Successfully attached read-only disk ${CANINE_RODISK}." >&2',

  "done",
 ]

From 2bf0429ab87f1ebe48bc5283032cdf8d2c500696 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 13 Oct 2023 15:51:31 +0000
Subject: [PATCH 10/19] Update failure counts

* Recoverable localization failure now updates failure count (so that
  relocalization retries don't get counted as preemptions)
* Update failure count as soon as task fails (so that OOM restarts
  don't get counted as preemptions)
---
 canine/orchestrator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 7bd514e..5646a0d 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -79,6 +79,7 @@
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
   echo -e "!!!! JOB FAILED! (EXIT CODE !!!!\e[29G$CANINE_JOB_RC)" >&2
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
+  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
   echo '++++ STARTING JOB CLEANUP ++++' >&2
   $CANINE_JOBS/$SLURM_ARRAY_TASK_ID/teardown.sh >&2
   TEARDOWN_RC=$?
@@ -91,7 +92,6 @@
     echo "INFO: Retrying job (attempt $((${{{{SLURM_RESTART_COUNT:-0}}}}+1))/$CANINE_RETRY_LIMIT)" >&2
     [ -f $CANINE_JOB_ROOT/stdout ] && mv $CANINE_JOB_ROOT/stdout $CANINE_JOB_ROOT/stdout_${{{{SLURM_RESTART_COUNT:-0}}}} || :
     [ -f $CANINE_JOB_ROOT/stderr ] && mv $CANINE_JOB_ROOT/stderr $CANINE_JOB_ROOT/stderr_${{{{SLURM_RESTART_COUNT:-0}}}} || :
-    echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
     scontrol requeue $SLURM_JOB_ID
   fi
 done
@@ -99,6 +99,7 @@
 # these are special exit codes that localization.sh can explicitly return
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
   echo "INFO: localization will be retried" >&2
+  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
   scontrol requeue $SLURM_JOB_ID
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
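
The counter written above feeds the preemption heuristic at the top of the entrypoint: restarts that were not matched by a recorded failure are presumed to be preemptions. A short bash sketch of the two idioms involved, the read-or-default increment and the preemption estimate, using a stand-in path rather than the real $CANINE_JOB_ROOT layout:

    #!/bin/bash
    COUNT_FILE=/tmp/demo.failure_count   # stand-in for $CANINE_JOB_ROOT/.failure_count

    # increment with default: read the current count if the file exists (else 0), add 1
    echo $(($([ -f $COUNT_FILE ] && cat $COUNT_FILE || echo -n 0)+1)) > $COUNT_FILE

    # estimate preemptions: total restarts minus restarts attributed to real failures
    SLURM_RESTART_COUNT=3                # normally exported by Slurm
    FAILURES=$([ -f $COUNT_FILE ] && cat $COUNT_FILE || echo -n 0)
    echo "presumed preemptions: $((SLURM_RESTART_COUNT - FAILURES))"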

From 8962bb37cfaaedc921bff3db4af37ebf99d0d985 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 13 Oct 2023 16:01:02 +0000
Subject: [PATCH 11/19] Print more informative messages when mounting RODISK

---
 canine/localization/base.py | 17 ++++++++++-------
 canine/orchestrator.py | 2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 6c2c443..0959708 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1276,19 +1276,22 @@ def sensitive_ext_extract(basename):
  "CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}",
  "CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}",

- 'echo "Attaching read-only disk ${CANINE_RODISK} ..." >&2',
+ 'echo "INFO: Attaching read-only disk ${CANINE_RODISK} ..." >&2',

  "if [[ ! -d ${CANINE_RODISK_DIR} ]]; then",
  "sudo mkdir -p ${CANINE_RODISK_DIR}",
  "fi",

+ # create tempfile to hold diagnostic information
+ "DIAG_FILE=$(mktemp)",
+
  # attach the disk if it's not already
  "if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
  # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &> /dev/null || { [ $? == 5 ] && exit 5 || true; }",
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
  # this means the node is likely bad
  'if [ ! -b /dev/disk/by-id/google-${CANINE_RODISK} ] && gcloud compute disks describe ${CANINE_RODISK} --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\'; then',
  'sudo touch /.fatal_disk_issue_sentinel',
- 'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
+ 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
  'exit 1',
  'fi',
  # otherwise, it didn't attach for some other reason
- 'echo "Read-only disk did not successfully attach!" >&2',
+ 'echo "ERROR: Read-only disk could not be attached!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :',
  'exit 1',
  'fi',
  "sleep 10; ((++tries))",
  "done",

  # because we forced zero exits for the previous commands,
  # we need to verify that the mount actually exists
- "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; exit 1; }",
+ 'mountpoint -q ${CANINE_RODISK_DIR} || { echo "ERROR: Read-only disk mount failed!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :; exit 1; }',

  # also verify that the filesystem is OK
- "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 5; }",
+ "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'WARNING: Read-only disk did not properly mount on this node; retrying.' >&2; exit 5; }",

  # lock the disk; will be unlocked during teardown script (or if script crashes)
  # this is to ensure that we don't unmount the disk during teardown
  # if other processes are still using it
  "flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

- 'echo "Successfully attached read-only disk ${CANINE_RODISK}." >&2',
+ 'echo "INFO: Successfully attached read-only disk ${CANINE_RODISK}." >&2',

  "done",
 ]
diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 5646a0d..2c4a75d 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -98,7 +98,7 @@
 echo -n $CANINE_JOB_RC > $CANINE_JOB_ROOT/.job_exit_code
 # these are special exit codes that localization.sh can explicitly return
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
-  echo "INFO: localization will be retried" >&2
+  echo "WARNING: localization will be retried" >&2
   echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
   scontrol requeue $SLURM_JOB_ID
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)

From cac429ba34ea80647c9d5865c25e8cd56a4db3ee Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 17 Oct 2023 16:53:11 +0000
Subject: [PATCH 12/19] Explicitly raise warning for job submission failure due to too high resource request

---
 canine/backends/base.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/canine/backends/base.py b/canine/backends/base.py
index 310e0f8..cf93e14 100644
--- a/canine/backends/base.py
+++ b/canine/backends/base.py
@@ -400,7 +400,15 @@ def sbatch(self, command: str, *slurmopts: str, **slurmparams: typing.Any) -> st
             command
         )
         status, stdout, stderr = self.invoke(command)
-        check_call(command, status, stdout, stderr)
+
+        if status != 0:
+            # explicitly catch error if user requested impossibly high resources
+            if stderr is not None and "Requested node configuration is not available" in stderr.read().decode():
+                canine_logging.error(f"Requested CPU/memory resources exceed maximum cluster capacity. Please request fewer resources, or add a suitable node to /mnt/nfs/clust_conf/slurm/nodetypes.json")
+                raise ResourceWarning
+            # all other errors are handled generically
+            check_call(command, status, stdout, stderr)
+
         out = stdout.read().decode()
         err = stderr.read().decode()
         result = batch_job_pattern.search(out)

From 68a92aae5a00fbfbb34c6f9afe0ba2c650b1a733 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Thu, 26 Oct 2023 18:55:09 +0000
Subject: [PATCH 13/19] Retry additional localization failures

* If node cannot attach disks because it's bad
* If disk is being created by another node
---
 canine/localization/base.py | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 0959708..317cec1 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -822,29 +822,8 @@ def create_persistent_disk(self,
  'while [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; do',
  ## check if disk is being created by _another_ instance (grep -qv $CANINE_NODE_NAME)
  'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -qv "$CANINE_NODE_NAME"\'$\'; then',
- # if disk is a localization disk (i.e. not a scratch disk), wait approximately how long it would take to transfer files to it
- 'if ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "scratch=yes"; then',
- 'TRIES=0',
- # wait until disk is marked "finished"
- 'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
- 'echo "Waiting for localization disk to become available ..." >&2',
- '[ $TRIES == 0 ] && sleep {} || sleep 300'.format(int(disk_size/0.1)), # assume 100 MB/sec transfer for first timeout; 5 minutes thereafter up to 10 times
- '[ $TRIES -ge 10 ] && { echo "Exceeded timeout waiting for disk to become available" >&2; exit 1; } || :',
- '((++TRIES))',
- 'done',
- 'exit 15 #DEBUG_OMIT', # special exit code to cause the script to be skipped in entrypoint.sh
- # if disk is a scratch disk, wait up to two hours for it to finish. once it's finished, fail the localizer, to cause task to be requeued and avoided.
- 'else',
- 'TRIES=0',
- # wait until disk is marked "finished"
- 'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
- 'echo "Waiting for scratch disk to become available ..." >&2',
- 'sleep 60',
- '[ $TRIES -ge 120 ] && { echo "Exceeded timeout waiting for another node to finish making scratch disk" >&2; exit 1; } || :',
- '((++TRIES))',
- 'done',
- 'exit 1 #DEBUG_OMIT', # fail localizer -> retry task via Prefect -> job avoid. TODO: should this be exit code 5 (fail localizer -> auto-requeue via slurm?)
- 'fi',
+ 'echo "ERROR: Disk being created by another instance. Will retry when finished." >&2',
+ 'exit 5', # cause task to be requeued
  'fi',
  # TODO: what if the task exited on the other
  # instance without running the teardown script
  # are there any scenarios in which this would be a bad idea?
  ## if disk is not being created by another instance, it might just be taking a bit to attach. give it a chance to appear in /dev
- '[ $DELAY -gt 128 ] && { echo "Exceeded timeout trying to attach disk" >&2; exit 1; } || :',
+ '[ $DELAY -gt 128 ] && { echo "ERROR: Exceeded timeout trying to attach disk" >&2; exit 5; } || :',
  'sleep $DELAY; ((DELAY *= 2))',
  # try attaching again if delay has exceeded 8 seconds
  'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || :',
  'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\' && [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; then',
  'sudo touch /.fatal_disk_issue_sentinel',
- 'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
- 'exit 1',
+ 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry localization on another node." >&2',
+ 'exit 5',
  'fi',
  'fi',
  'done',
  # this means the node is likely bad
  'if [ ! -b /dev/disk/by-id/google-${CANINE_RODISK} ] && gcloud compute disks describe ${CANINE_RODISK} --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\'; then',
  'sudo touch /.fatal_disk_issue_sentinel',
- 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
- 'exit 1',
+ 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry on another node." >&2',
+ 'exit 5', # retry on another node
  'fi',
  # otherwise, it didn't attach for some other reason

From a84b85f61f8a6f96a20090ff8de6c7e91da86768 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 5 Dec 2023 21:39:01 +0000
Subject: [PATCH 14/19] Distinguish between job failure count and localization failure count

---
 canine/orchestrator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 2c4a75d..af959c1 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -53,7 +53,7 @@
 source $CANINE_JOBS/$SLURM_ARRAY_TASK_ID/setup.sh
 rm -f $CANINE_JOB_ROOT/.*exit_code || :
 echo 'COMPLETE ----' >&2
-if [ $((${{{{SLURM_RESTART_COUNT:-0}}}}-$([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0))) -ge $CANINE_PREEMPT_LIMIT ]; then
+if [ $((${{{{SLURM_RESTART_COUNT:-0}}}}-$([ -f $CANINE_JOB_ROOT/.job_failure_count ] && cat $CANINE_JOB_ROOT/.job_failure_count || echo -n 0))) -ge $CANINE_PREEMPT_LIMIT ]; then
   echo "Preemption limit exceeded; requeueing on non-preemptible nodes" >&2
   exit 123 # special exit code indicating excessive preemption
 fi
@@ -79,7 +79,7 @@
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
   echo -e "!!!! JOB FAILED! (EXIT CODE !!!!\e[29G$CANINE_JOB_RC)" >&2
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
-  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
+  echo $(($([ -f $CANINE_JOB_ROOT/.job_failure_count ] && cat $CANINE_JOB_ROOT/.job_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.job_failure_count
   echo '++++ STARTING JOB CLEANUP ++++' >&2
   $CANINE_JOBS/$SLURM_ARRAY_TASK_ID/teardown.sh >&2
   TEARDOWN_RC=$?
@@ -99,7 +99,7 @@
 # these are special exit codes that localization.sh can explicitly return
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
   echo "WARNING: localization will be retried" >&2
-  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
+  echo $(($([ -f $CANINE_JOB_ROOT/.localization_failure_count ] && cat $CANINE_JOB_ROOT/.localization_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.localization_failure_count
   scontrol requeue $SLURM_JOB_ID
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
   echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code

From 70c2c943d225ab13cfe12b8feb36ab7f47c40b3c Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Wed, 13 Dec 2023 21:29:27 +0000
Subject: [PATCH 15/19] Avoid circular dependency when installing

slurm_gcp_docker now requires that wolF be installed in the image, in
order to execute wolF scripts on worker nodes. This prevents Canine
from trying to install slurm_gcp_docker if it detects that it's being
built in the slurm_gcp_docker container.
---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index a23b579..545d2dc 100644
--- a/setup.py
+++ b/setup.py
@@ -57,9 +57,9 @@
         'tables>=3.6.1',
         'google-crc32c>=1.5.0',
         'google-cloud-compute>=1.6.1',
-        #'slurm_gcp_docker>=0.12',
-        'slurm_gcp_docker @ git+https://github.com/getzlab/slurm_gcp_docker@v0.16.0',
-    ],
+    ] + (['slurm_gcp_docker @ git+https://github.com/getzlab/slurm_gcp_docker@v0.16.0']
+        if not os.path.exists("/.dockerenv") else []
+    ), # avoid circular dependency of slurm_gcp_docker -> wolf -> canine
     python_requires = ">3.7",
     classifiers = [
         "Development Status :: 4 - Beta",

From 21fe533f1914d625862c4b1e2d5725179738ef75 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Thu, 14 Dec 2023 12:52:36 +0000
Subject: [PATCH 16/19] More robust check on whether USER is set in environment

If $USER is not set, canine cannot even be imported, since
os.environ["USER"] is evaluated during package import when used as a
constructor default argument. We need to move the lookup into the
function body so it is not evaluated at import time.
---
 canine/backends/dockerTransient.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/canine/backends/dockerTransient.py b/canine/backends/dockerTransient.py
index 8d8a409..140b2e1 100644
--- a/canine/backends/dockerTransient.py
+++ b/canine/backends/dockerTransient.py
@@ -37,11 +37,13 @@ def __init__(
         image_project = "broad-getzlab-workflows", image = None,
         storage_namespace = "workspace", storage_bucket = None,
         storage_disk = None, storage_disk_size = "100",
-        clust_frac = 1.0, user = os.environ["USER"], shutdown_on_exit = False, **kwargs
+        clust_frac = 1.0, user = None, shutdown_on_exit = False, **kwargs
     ):
         if user is None:
-            # IE: USER was not set
-            raise ValueError("USER not set in environment. Must explicitly pass user argument")
+            if "USER" in os.environ:
+                user = os.environ["USER"]
+            else:
+                raise ValueError("$USER not set in environment. Must explicitly pass user argument")

         if storage_bucket is not None and storage_disk is not None:
             canine_logging.warning("You specified both a persistent disk and cloud bucket to store workflow outputs; will only store to bucket!")

From 38100c055f8002ed549184e9667800eeb84a0320 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 15 Dec 2023 16:05:09 +0000
Subject: [PATCH 17/19] Rewind stderr stream so that it can be printed to console

---
 canine/backends/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/canine/backends/base.py b/canine/backends/base.py
index cf93e14..119ed34 100644
--- a/canine/backends/base.py
+++ b/canine/backends/base.py
@@ -403,7 +403,9 @@ def sbatch(self, command: str, *slurmopts: str, **slurmparams: typing.Any) -> st

         if status != 0:
             # explicitly catch error if user requested impossibly high resources
-            if stderr is not None and "Requested node configuration is not available" in stderr.read().decode():
+            stderr_str = stderr.read().decode()
+            stderr.seek(0)
+            if stderr is not None and "Requested node configuration is not available" in stderr_str:
                 canine_logging.error(f"Requested CPU/memory resources exceed maximum cluster capacity. Please request fewer resources, or add a suitable node to /mnt/nfs/clust_conf/slurm/nodetypes.json")
                 raise ResourceWarning
             # all other errors are handled generically

From 649ed84ee7b717af46941d2cd522206831adcaf3 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 15 Dec 2023 17:35:57 +0000
Subject: [PATCH 18/19] Bump version

---
 canine/orchestrator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 6263c78..270b318 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -14,7 +14,7 @@
 import numpy as np
 import pandas as pd
 from agutil import status_bar
-version = '0.15.1'
+version = '0.15.4'

 ADAPTERS = {
     'Manual': ManualAdapter,

From babe4ece1f9a50d3a9d7e4f515b5789942c54790 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Sat, 16 Dec 2023 14:06:12 +0000
Subject: [PATCH 19/19] Add timeouts to attaching/detaching rodisks

---
 canine/localization/base.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 0959708..9ee87aa 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1290,8 +1290,10 @@ def sensitive_ext_extract(basename):
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
- # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
+ # which we explicitly propagate to cause localization to be retried.
+ # mounting the disk can also hang (exit 124), in which case we
+ # also cause localization to be retried by returning exit 5
+ "timeout -k 30 30 gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { ec=$?; [[ $ec == 5 || $ec == 124 ]] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1432,7 +1434,7 @@ def sensitive_ext_extract(basename):
  ' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
  ' echo "Unmounting read-only disk ${CANINE_RODISK}" >&2',
  ' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
- ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
+ ' timeout -k 30 30 gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
  ' else',
  ' echo "Read-only disk ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
  ' fi',
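
For reference, the timeout(1) behavior this last patch leans on: "timeout -k 30 30 CMD" sends SIGTERM after 30 seconds, SIGKILL 30 seconds later, and reports a timed-out command with exit status 124, which the localizer now treats the same way as a quota error (retry). A tiny bash sketch of that mapping, with sleep standing in for a hung gcloud call:

    #!/bin/bash
    # a command that hangs: timeout kills it after 2 seconds and returns 124
    timeout -k 30 2 sleep 60 || {
        ec=$?
        # treat quota errors (5) and hangs (124) as "retry localization"; ignore the rest
        [[ $ec == 5 || $ec == 124 ]] && exit 5 || true
    }
    echo "not reached when the command hangs"

The 2-second limit here is only to make the example quick to run; the patch itself uses 30 seconds.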