From 271df555df62ce0e4ee56c84c87e7eb4b537de17 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Mon, 2 Oct 2023 18:50:44 +0000
Subject: [PATCH 01/19] Add special exit 5 to cause localization to be retried

---
 canine/localization/base.py | 9 +++++----
 canine/orchestrator.py | 7 ++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 40c88a3..a5218d7 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -812,7 +812,7 @@ def create_persistent_disk(self,
  ## attach as read-write, using same device-name as disk-name
  'if [[ ! -e /dev/disk/by-id/google-${GCP_DISK_NAME} ]]; then',
- 'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || true',
+ 'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || { [ $? == 5 ] && exit 5 || true; }',
  'fi',

  ## wait for disk to attach, with exponential backoff up to 2 minutes
@@ -1285,8 +1285,9 @@ def sensitive_ext_extract(basename):
  "if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
- # force a 0 exit
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || true",
+ # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
+ # which we explicitly propagate to cause localization to be retried
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1312,7 +1313,7 @@ def sensitive_ext_extract(basename):
  "done",

  # mount within Slurm worker container
- "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
+ "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
 # # mount on host (so that task dockers can access it)
 # "if [[ -f /.dockerenv ]]; then",
 # "sudo nsenter -t 1 -m mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 6263c78..6c9999d 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -96,7 +96,12 @@
   fi
 done
 echo -n $CANINE_JOB_RC > $CANINE_JOB_ROOT/.job_exit_code
-elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # this is a special exit code that localization.sh can explicitly return
+# these are special exit codes that localization.sh can explicitly return
+elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
+  echo "INFO: localization will be retried" >&2
+  scontrol requeue $SLURM_JOB_ID
+  scontrol update $SLURM_JOB_ID starttime=now+10m
+elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization can be skipped (e.g. localization disk already exists)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
   echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code
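
A note on the retry idiom this patch introduces: the localization script normally swallows attach-disk failures (another task may have raced it to the same disk), but gcloud's exit code 5, which the comments above associate with exceeded quota, is now propagated so the job wrapper can requeue the task. A minimal bash sketch of that control flow, using a hypothetical attach_disk stand-in rather than the real gcloud call:

    #!/bin/bash
    # stand-in for "gcloud compute instances attach-disk"; FAKE_RC simulates its
    # exit status (0 = attached, 1 = benign race with another task, 5 = quota error)
    attach_disk() { return "${FAKE_RC:-0}"; }

    # swallow ordinary failures, but propagate exit code 5 so the calling
    # entrypoint knows localization should be requeued rather than failed
    attach_disk || { [ $? == 5 ] && exit 5 || true; }

    echo "continuing localization"

Running this with FAKE_RC=1 prints the final message; with FAKE_RC=5 it exits with status 5, which the orchestrator entrypoint patched above maps to an scontrol requeue.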

From e92261a8f21125acd836796d22f6617203e422ac Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Mon, 2 Oct 2023 18:51:01 +0000
Subject: [PATCH 02/19] Remove unused code

---
 canine/localization/base.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index a5218d7..1b2dbec 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1314,10 +1314,6 @@ def sensitive_ext_extract(basename):
  # mount within Slurm worker container
  "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
-# # mount on host (so that task dockers can access it)
-# "if [[ -f /.dockerenv ]]; then",
-# "sudo nsenter -t 1 -m mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
-# "fi",
  "fi",

  # because we forced zero exits for the previous commands,
@@ -1425,10 +1425,6 @@ def sensitive_ext_extract(basename):
  ' CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}',
  ' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
  ' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
-# # unmount on container host too
-# ' if [[ -f /.dockerenv ]]; then',
-# ' sudo nsenter -t 1 -m umount ${CANINE_RODISK_DIR} || true',
-# ' fi',
  ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK || echo "Error detaching disk ${CANINE_RODISK}" >&2',
  ' else',
  ' echo "RODISK ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',

From bdfad081b500ae87b1309acc5e18a2ca23808143 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Mon, 2 Oct 2023 19:00:01 +0000
Subject: [PATCH 03/19] Output all RODISK mount messages to stderr

* No more DIAG_FILE
* Clarify some error messages
---
 canine/localization/base.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 1b2dbec..4bc087c 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1278,16 +1278,13 @@ def sensitive_ext_extract(basename):
  "sudo mkdir -p ${CANINE_RODISK_DIR}",
  "fi",

- # create tempfile to hold diagnostic information
- "DIAG_FILE=$(mktemp)",
-
  # attach the disk if it's not already
  "if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
  # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1306,22 +1303,22 @@ def sensitive_ext_extract(basename):
  'exit 1',
  'fi',
  # otherwise, it didn't attach for some other reason
- 'echo "Timeout exceeded for disk to attach; perhaps the stderr of \`gcloud compute instances attach disk\` might contain insight:" >&2; cat $DIAG_FILE >&2',
+ 'echo "Timeout exceeded for disk to attach" >&2',
  'exit 1',
  'fi',
  "sleep 10; ((++tries))",
  "done",

  # mount within Slurm worker container
- "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
+ "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # because we forced zero exits for the previous commands,
  # we need to verify that the mount actually exists
- "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; cat $DIAG_FILE >&2; exit 1; }",
+ "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; exit 1; }",

  # also verify that the filesystem is OK
- "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk filesystem is bad!' >&2; cat $DIAG_FILE >&2; exit 1; }",
+ "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 1; }",

  # lock the disk; will be unlocked during teardown script (or if script crashes)
  # this is to ensure that we don't unmount the disk during teardown

From 565b57fb5265da7b5f792517aa6c498cf8177b5c Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 15:57:35 +0000
Subject: [PATCH 04/19] Clarify comments

---
 canine/localization/base.py | 2 +-
 canine/orchestrator.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 4bc087c..408d129 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -841,7 +841,7 @@ def create_persistent_disk(self,
  '[ $TRIES -ge 120 ] && { echo "Exceeded timeout waiting for another node to finish making scratch disk" >&2; exit 1; } || :',
  '((++TRIES))',
  'done',
- 'exit 1 #DEBUG_OMIT', # fail localizer -> requeue task -> job avoid
+ 'exit 1 #DEBUG_OMIT', # fail localizer -> retry task via Prefect -> job avoid. TODO: should this be exit code 5 (fail localizer -> auto-requeue via slurm?)
  'fi',
  'fi',
  # TODO: what if the task exited on the other
diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 6c9999d..f3e595a 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -101,7 +101,7 @@
   echo "INFO: localization will be retried" >&2
   scontrol requeue $SLURM_JOB_ID
   scontrol update $SLURM_JOB_ID starttime=now+10m
-elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization can be skipped (e.g. localization disk already exists)
+elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
   echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code

From 8d16003d52c7fa8203fd7367f754fee8ae77757e Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 15:57:47 +0000
Subject: [PATCH 05/19] This command won't run, since job requeue will quit the script

---
 canine/orchestrator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index f3e595a..7bd514e 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -100,7 +100,6 @@
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
   echo "INFO: localization will be retried" >&2
   scontrol requeue $SLURM_JOB_ID
-  scontrol update $SLURM_JOB_ID starttime=now+10m
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0

From 940aeb292e04a4795ba68fd5a3c370ecaaef784b Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 16:03:50 +0000
Subject: [PATCH 06/19] No need to have an exit 5 here, since that code is reserved for gcloud exceeding quota

---
 canine/localization/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 408d129..921ea9f 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1310,7 +1310,7 @@ def sensitive_ext_extract(basename):
  "done",

  # mount within Slurm worker container
- "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || { [ $? == 5 ] && exit 5 || true; }",
+ "sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || true",
  "fi",

  # because we forced zero exits for the previous commands,

From 3029c3780cc4cfaa2ee472e80983b018f685c01c Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 21:46:31 +0000
Subject: [PATCH 07/19] Make status messages more friendly

---
 canine/localization/base.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 921ea9f..101752b 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -802,6 +802,8 @@ def create_persistent_disk(self,
  'GCP_DISK_SIZE={}'.format(disk_size),
  'GCP_TSNT_DISKS_DIR={}'.format(mount_prefix),

+ 'echo "Saving outputs to scratch disk ${GCP_DISK_NAME}" >&2' if is_scratch_disk else 'echo "Localizing inputs to cache disk ${GCP_DISK_NAME} (${GCP_DISK_SIZE}GB)" >&2',
+
  ## create disk
  'if ! gcloud compute disks describe "${GCP_DISK_NAME}" --zone ${CANINE_NODE_ZONE}; then',
  'gcloud compute disks create "${GCP_DISK_NAME}" --size "${GCP_DISK_SIZE}GB" --type pd-standard --zone "${CANINE_NODE_ZONE}" --labels wolf=canine',
@@ -1274,6 +1276,8 @@ def sensitive_ext_extract(basename):
  "CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}",
  "CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}",

+ 'echo "Attaching read-only disk ${CANINE_RODISK} ..." >&2',
+
  "if [[ ! -d ${CANINE_RODISK_DIR} ]]; then",
  "sudo mkdir -p ${CANINE_RODISK_DIR}",
  "fi",
@@ -1284,7 +1288,7 @@ def sensitive_ext_extract(basename):
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
  # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro || { [ $? == 5 ] && exit 5 || true; }",
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &> /dev/null || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1303,7 +1307,7 @@ def sensitive_ext_extract(basename):
  'exit 1',
  'fi',
  # otherwise, it didn't attach for some other reason
- 'echo "Timeout exceeded for disk to attach" >&2',
+ 'echo "Read-only disk did not successfully attach!" >&2',
  'exit 1',
  'fi',
  "sleep 10; ((++tries))",
  "done",
@@ -1318,6 +1322,8 @@ def sensitive_ext_extract(basename):
  # if other processes are still using it
  "flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

+ 'echo "Successfully attached read-only disk ${CANINE_RODISK} ..." >&2',
+
  "done",
 ]
@@ -1421,10 +1427,11 @@ def sensitive_ext_extract(basename):
  ' CANINE_RODISK=${!CANINE_RODISK}',
  ' CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}',
  ' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
+ ' echo "Unmounting read-only disk ${CANINE_RODISK}" >&2',
  ' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
- ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK || echo "Error detaching disk ${CANINE_RODISK}" >&2',
+ ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
  ' else',
- ' echo "RODISK ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
+ ' echo "Read-only disk ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
  ' fi',
  'done)'
 ] + ( disk_teardown_script if self.localize_to_persistent_disk else [] )
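
The teardown block touched above relies on flock semantics worth spelling out: while a job is using a read-only disk, localization keeps a shared lock on the mount directory (the background "flock -os ... sleep infinity" line), and teardown only unmounts if a non-blocking "flock -n" probe succeeds. A small self-contained sketch of that interplay, using a temporary directory instead of a real RODISK mount point:

    #!/bin/bash
    LOCKDIR=$(mktemp -d)    # stands in for ${CANINE_RODISK_DIR}

    # "localizer" of another job: hold a shared lock on the directory for a few seconds
    flock -s "$LOCKDIR" sleep 3 &

    sleep 1
    # "teardown": the non-blocking exclusive probe fails while the shared lock is held
    flock -n "$LOCKDIR" true && echo "lock free; safe to unmount and detach" \
                             || echo "disk busy (another job); skipping unmount"

    wait
    # once the lock holder exits, the same probe succeeds
    flock -n "$LOCKDIR" true && echo "lock free; safe to unmount and detach"

This only illustrates the locking pattern; the real script also checks mountpoint -q and calls gcloud to detach the disk.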

From ac9cef290e350dc6c0248416cc69e35e80484eb3 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 3 Oct 2023 22:36:37 +0000
Subject: [PATCH 08/19] This is a recoverable error

---
 canine/localization/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 101752b..b156dc4 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1322,7 +1322,7 @@ def sensitive_ext_extract(basename):
  "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; exit 1; }",

  # also verify that the filesystem is OK
- "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 1; }",
+ "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 5; }",

  # lock the disk; will be unlocked during teardown script (or if script crashes)
  # this is to ensure that we don't unmount the disk during teardown

From 1710fc4c82510377c13a9025fdc5f1bf9e60ba1e Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Wed, 4 Oct 2023 00:04:22 +0000
Subject: [PATCH 09/19] Ellipsis -> period

---
 canine/localization/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index b156dc4..6c2c443 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1329,7 +1329,7 @@ def sensitive_ext_extract(basename):
  # if other processes are still using it
  "flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

- 'echo "Successfully attached read-only disk ${CANINE_RODISK} ..." >&2',
+ 'echo "Successfully attached read-only disk ${CANINE_RODISK}." >&2',

  "done",
 ]

From 2bf0429ab87f1ebe48bc5283032cdf8d2c500696 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 13 Oct 2023 15:51:31 +0000
Subject: [PATCH 10/19] Update failure counts

* Recoverable localization failure now updates failure count (so that
  relocalization retries don't get counted as preemptions)
* Update failure count as soon as task fails (so that OOM restarts
  don't get counted as preemptions)
---
 canine/orchestrator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 7bd514e..5646a0d 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -79,6 +79,7 @@
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
   echo -e "!!!! JOB FAILED! (EXIT CODE !!!!\e[29G$CANINE_JOB_RC)" >&2
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
+  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
   echo '++++ STARTING JOB CLEANUP ++++' >&2
   $CANINE_JOBS/$SLURM_ARRAY_TASK_ID/teardown.sh >&2
   TEARDOWN_RC=$?
@@ -91,7 +92,6 @@
     echo "INFO: Retrying job (attempt $((${{{{SLURM_RESTART_COUNT:-0}}}}+1))/$CANINE_RETRY_LIMIT)" >&2
     [ -f $CANINE_JOB_ROOT/stdout ] && mv $CANINE_JOB_ROOT/stdout $CANINE_JOB_ROOT/stdout_${{{{SLURM_RESTART_COUNT:-0}}}} || :
     [ -f $CANINE_JOB_ROOT/stderr ] && mv $CANINE_JOB_ROOT/stderr $CANINE_JOB_ROOT/stderr_${{{{SLURM_RESTART_COUNT:-0}}}} || :
-    echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
     scontrol requeue $SLURM_JOB_ID
   fi
 done
@@ -99,6 +99,7 @@
 # these are special exit codes that localization.sh can explicitly return
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
   echo "INFO: localization will be retried" >&2
+  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
   scontrol requeue $SLURM_JOB_ID
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
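
The counter written above feeds the preemption heuristic at the top of the entrypoint: restarts that were not matched by a recorded failure are presumed to be preemptions. A short bash sketch of the two idioms involved, the read-or-default increment and the preemption estimate, using a stand-in path rather than the real $CANINE_JOB_ROOT layout:

    #!/bin/bash
    COUNT_FILE=/tmp/demo.failure_count   # stand-in for $CANINE_JOB_ROOT/.failure_count

    # increment with default: read the current count if the file exists (else 0), add 1
    echo $(($([ -f $COUNT_FILE ] && cat $COUNT_FILE || echo -n 0)+1)) > $COUNT_FILE

    # estimate preemptions: total restarts minus restarts attributed to real failures
    SLURM_RESTART_COUNT=3                # normally exported by Slurm
    FAILURES=$([ -f $COUNT_FILE ] && cat $COUNT_FILE || echo -n 0)
    echo "presumed preemptions: $((SLURM_RESTART_COUNT - FAILURES))"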

From 8962bb37cfaaedc921bff3db4af37ebf99d0d985 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 13 Oct 2023 16:01:02 +0000
Subject: [PATCH 11/19] Print more informative messages when mounting RODISK

---
 canine/localization/base.py | 17 ++++++++++-------
 canine/orchestrator.py | 2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 6c2c443..0959708 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1276,19 +1276,22 @@ def sensitive_ext_extract(basename):
  "CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}",
  "CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}",

- 'echo "Attaching read-only disk ${CANINE_RODISK} ..." >&2',
+ 'echo "INFO: Attaching read-only disk ${CANINE_RODISK} ..." >&2',

  "if [[ ! -d ${CANINE_RODISK_DIR} ]]; then",
  "sudo mkdir -p ${CANINE_RODISK_DIR}",
  "fi",

+ # create tempfile to hold diagnostic information
+ "DIAG_FILE=$(mktemp)",
+
  # attach the disk if it's not already
  "if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
  # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &> /dev/null || { [ $? == 5 ] && exit 5 || true; }",
+ "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
  # this means the node is likely bad
  'if [ ! -b /dev/disk/by-id/google-${CANINE_RODISK} ] && gcloud compute disks describe ${CANINE_RODISK} --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\'; then',
  'sudo touch /.fatal_disk_issue_sentinel',
- 'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
+ 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
  'exit 1',
  'fi',
  # otherwise, it didn't attach for some other reason
- 'echo "Read-only disk did not successfully attach!" >&2',
+ 'echo "ERROR: Read-only disk could not be attached!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :',
  'exit 1',
  'fi',
  "sleep 10; ((++tries))",
  "done",

  # because we forced zero exits for the previous commands,
  # we need to verify that the mount actually exists
- "mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; exit 1; }",
+ 'mountpoint -q ${CANINE_RODISK_DIR} || { echo "ERROR: Read-only disk mount failed!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :; exit 1; }',

  # also verify that the filesystem is OK
- "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk mount is bad!' >&2; exit 5; }",
+ "timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'WARNING: Read-only disk did not properly mount on this node; retrying.' >&2; exit 5; }",

  # lock the disk; will be unlocked during teardown script (or if script crashes)
  # this is to ensure that we don't unmount the disk during teardown
  # if other processes are still using it
  "flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

- 'echo "Successfully attached read-only disk ${CANINE_RODISK}." >&2',
+ 'echo "INFO: Successfully attached read-only disk ${CANINE_RODISK}." >&2',

  "done",
 ]
diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 5646a0d..2c4a75d 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -98,7 +98,7 @@
 echo -n $CANINE_JOB_RC > $CANINE_JOB_ROOT/.job_exit_code
 # these are special exit codes that localization.sh can explicitly return
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
-  echo "INFO: localization will be retried" >&2
+  echo "WARNING: localization will be retried" >&2
   echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
   scontrol requeue $SLURM_JOB_ID
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)

From cac429ba34ea80647c9d5865c25e8cd56a4db3ee Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 17 Oct 2023 16:53:11 +0000
Subject: [PATCH 12/19] Explicitly raise warning for job submission failure due to too high resource request

---
 canine/backends/base.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/canine/backends/base.py b/canine/backends/base.py
index 310e0f8..cf93e14 100644
--- a/canine/backends/base.py
+++ b/canine/backends/base.py
@@ -400,7 +400,15 @@ def sbatch(self, command: str, *slurmopts: str, **slurmparams: typing.Any) -> st
             command
         )
         status, stdout, stderr = self.invoke(command)
-        check_call(command, status, stdout, stderr)
+
+        if status != 0:
+            # explicitly catch error if user requested impossibly high resources
+            if stderr is not None and "Requested node configuration is not available" in stderr.read().decode():
+                canine_logging.error(f"Requested CPU/memory resources exceed maximum cluster capacity. Please request fewer resources, or add a suitable node to /mnt/nfs/clust_conf/slurm/nodetypes.json")
+                raise ResourceWarning
+            # all other errors are handled generically
+            check_call(command, status, stdout, stderr)
+
         out = stdout.read().decode()
         err = stderr.read().decode()
         result = batch_job_pattern.search(out)

From 68a92aae5a00fbfbb34c6f9afe0ba2c650b1a733 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Thu, 26 Oct 2023 18:55:09 +0000
Subject: [PATCH 13/19] Retry additional localization failures

* If node cannot attach disks because it's bad
* If disk is being created by another node
---
 canine/localization/base.py | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 0959708..317cec1 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -822,29 +822,8 @@ def create_persistent_disk(self,
  'while [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; do',
  ## check if disk is being created by _another_ instance (grep -qv $CANINE_NODE_NAME)
  'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -qv "$CANINE_NODE_NAME"\'$\'; then',
- # if disk is a localization disk (i.e. not a scratch disk), wait approximately how long it would take to transfer files to it
- 'if ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "scratch=yes"; then',
- 'TRIES=0',
- # wait until disk is marked "finished"
- 'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
- 'echo "Waiting for localization disk to become available ..." >&2',
- '[ $TRIES == 0 ] && sleep {} || sleep 300'.format(int(disk_size/0.1)), # assume 100 MB/sec transfer for first timeout; 5 minutes thereafter up to 10 times
- '[ $TRIES -ge 10 ] && { echo "Exceeded timeout waiting for disk to become available" >&2; exit 1; } || :',
- '((++TRIES))',
- 'done',
- 'exit 15 #DEBUG_OMIT', # special exit code to cause the script to be skipped in entrypoint.sh
- # if disk is a scratch disk, wait up to two hours for it to finish. once it's finished, fail the localizer, to cause task to be requeued and avoided.
- 'else',
- 'TRIES=0',
- # wait until disk is marked "finished"
- 'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
- 'echo "Waiting for scratch disk to become available ..." >&2',
- 'sleep 60',
- '[ $TRIES -ge 120 ] && { echo "Exceeded timeout waiting for another node to finish making scratch disk" >&2; exit 1; } || :',
- '((++TRIES))',
- 'done',
- 'exit 1 #DEBUG_OMIT', # fail localizer -> retry task via Prefect -> job avoid. TODO: should this be exit code 5 (fail localizer -> auto-requeue via slurm?)
- 'fi',
+ 'echo "ERROR: Disk being created by another instance. Will retry when finished." >&2',
+ 'exit 5', # cause task to be requeued
  'fi',
  # TODO: what if the task exited on the other
  # instance without running the teardown script
  # are there any scenarios in which this would be a bad idea?
  ## if disk is not being created by another instance, it might just be taking a bit to attach. give it a chance to appear in /dev
- '[ $DELAY -gt 128 ] && { echo "Exceeded timeout trying to attach disk" >&2; exit 1; } || :',
+ '[ $DELAY -gt 128 ] && { echo "ERROR: Exceeded timeout trying to attach disk" >&2; exit 5; } || :',
  'sleep $DELAY; ((DELAY *= 2))',
  # try attaching again if delay has exceeded 8 seconds
  'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || :',
  'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\' && [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; then',
  'sudo touch /.fatal_disk_issue_sentinel',
- 'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
- 'exit 1',
+ 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry localization on another node." >&2',
+ 'exit 5',
  'fi',
  'fi',
  'done',
  # this means the node is likely bad
  'if [ ! -b /dev/disk/by-id/google-${CANINE_RODISK} ] && gcloud compute disks describe ${CANINE_RODISK} --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\'; then',
  'sudo touch /.fatal_disk_issue_sentinel',
- 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
- 'exit 1',
+ 'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry on another node." >&2',
+ 'exit 5', # retry on another node
  'fi',
  # otherwise, it didn't attach for some other reason

From a84b85f61f8a6f96a20090ff8de6c7e91da86768 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Tue, 5 Dec 2023 21:39:01 +0000
Subject: [PATCH 14/19] Distinguish between job failure count and localization failure count

---
 canine/orchestrator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 2c4a75d..af959c1 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -53,7 +53,7 @@
 source $CANINE_JOBS/$SLURM_ARRAY_TASK_ID/setup.sh
 rm -f $CANINE_JOB_ROOT/.*exit_code || :
 echo 'COMPLETE ----' >&2
-if [ $((${{{{SLURM_RESTART_COUNT:-0}}}}-$([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0))) -ge $CANINE_PREEMPT_LIMIT ]; then
+if [ $((${{{{SLURM_RESTART_COUNT:-0}}}}-$([ -f $CANINE_JOB_ROOT/.job_failure_count ] && cat $CANINE_JOB_ROOT/.job_failure_count || echo -n 0))) -ge $CANINE_PREEMPT_LIMIT ]; then
   echo "Preemption limit exceeded; requeueing on non-preemptible nodes" >&2
   exit 123 # special exit code indicating excessive preemption
 fi
@@ -79,7 +79,7 @@
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
   echo -e "!!!! JOB FAILED! (EXIT CODE !!!!\e[29G$CANINE_JOB_RC)" >&2
   echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
-  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
+  echo $(($([ -f $CANINE_JOB_ROOT/.job_failure_count ] && cat $CANINE_JOB_ROOT/.job_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.job_failure_count
   echo '++++ STARTING JOB CLEANUP ++++' >&2
   $CANINE_JOBS/$SLURM_ARRAY_TASK_ID/teardown.sh >&2
   TEARDOWN_RC=$?
@@ -99,7 +99,7 @@
 # these are special exit codes that localization.sh can explicitly return
 elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
   echo "WARNING: localization will be retried" >&2
-  echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
+  echo $(($([ -f $CANINE_JOB_ROOT/.localization_failure_count ] && cat $CANINE_JOB_ROOT/.localization_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.localization_failure_count
   scontrol requeue $SLURM_JOB_ID
 elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
   echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
   export CANINE_JOB_RC=0
   echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code

From 70c2c943d225ab13cfe12b8feb36ab7f47c40b3c Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Wed, 13 Dec 2023 21:29:27 +0000
Subject: [PATCH 15/19] Avoid circular dependency when installing

slurm_gcp_docker now requires that wolF be installed in the image, in
order to execute wolF scripts on worker nodes. This prevents Canine
from trying to install slurm_gcp_docker if it detects that it's being
built in the slurm_gcp_docker container.
---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index a23b579..545d2dc 100644
--- a/setup.py
+++ b/setup.py
@@ -57,9 +57,9 @@
         'tables>=3.6.1',
         'google-crc32c>=1.5.0',
         'google-cloud-compute>=1.6.1',
-        #'slurm_gcp_docker>=0.12',
-        'slurm_gcp_docker @ git+https://github.com/getzlab/slurm_gcp_docker@v0.16.0',
-    ],
+    ] + (['slurm_gcp_docker @ git+https://github.com/getzlab/slurm_gcp_docker@v0.16.0']
+        if not os.path.exists("/.dockerenv") else []
+    ), # avoid circular dependency of slurm_gcp_docker -> wolf -> canine
     python_requires = ">3.7",
     classifiers = [
         "Development Status :: 4 - Beta",

From 21fe533f1914d625862c4b1e2d5725179738ef75 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Thu, 14 Dec 2023 12:52:36 +0000
Subject: [PATCH 16/19] More robust check on whether USER is set in environment

If $USER is not set, canine cannot even be imported, since
os.environ["USER"] is evaluated during package import when used as a
constructor default argument. We need to move the lookup into the
function body so it is not evaluated at import time.
---
 canine/backends/dockerTransient.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/canine/backends/dockerTransient.py b/canine/backends/dockerTransient.py
index 8d8a409..140b2e1 100644
--- a/canine/backends/dockerTransient.py
+++ b/canine/backends/dockerTransient.py
@@ -37,11 +37,13 @@ def __init__(
         image_project = "broad-getzlab-workflows", image = None,
         storage_namespace = "workspace", storage_bucket = None,
         storage_disk = None, storage_disk_size = "100",
-        clust_frac = 1.0, user = os.environ["USER"], shutdown_on_exit = False, **kwargs
+        clust_frac = 1.0, user = None, shutdown_on_exit = False, **kwargs
     ):
         if user is None:
-            # IE: USER was not set
-            raise ValueError("USER not set in environment. Must explicitly pass user argument")
+            if "USER" in os.environ:
+                user = os.environ["USER"]
+            else:
+                raise ValueError("$USER not set in environment. Must explicitly pass user argument")

         if storage_bucket is not None and storage_disk is not None:
             canine_logging.warning("You specified both a persistent disk and cloud bucket to store workflow outputs; will only store to bucket!")

From 38100c055f8002ed549184e9667800eeb84a0320 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 15 Dec 2023 16:05:09 +0000
Subject: [PATCH 17/19] Rewind stderr stream so that it can be printed to console

---
 canine/backends/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/canine/backends/base.py b/canine/backends/base.py
index cf93e14..119ed34 100644
--- a/canine/backends/base.py
+++ b/canine/backends/base.py
@@ -403,7 +403,9 @@ def sbatch(self, command: str, *slurmopts: str, **slurmparams: typing.Any) -> st

         if status != 0:
             # explicitly catch error if user requested impossibly high resources
-            if stderr is not None and "Requested node configuration is not available" in stderr.read().decode():
+            stderr_str = stderr.read().decode()
+            stderr.seek(0)
+            if stderr is not None and "Requested node configuration is not available" in stderr_str:
                 canine_logging.error(f"Requested CPU/memory resources exceed maximum cluster capacity. Please request fewer resources, or add a suitable node to /mnt/nfs/clust_conf/slurm/nodetypes.json")
                 raise ResourceWarning
             # all other errors are handled generically

From 649ed84ee7b717af46941d2cd522206831adcaf3 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Fri, 15 Dec 2023 17:35:57 +0000
Subject: [PATCH 18/19] Bump version

---
 canine/orchestrator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/canine/orchestrator.py b/canine/orchestrator.py
index 6263c78..270b318 100644
--- a/canine/orchestrator.py
+++ b/canine/orchestrator.py
@@ -14,7 +14,7 @@
 import numpy as np
 import pandas as pd
 from agutil import status_bar
-version = '0.15.1'
+version = '0.15.4'

 ADAPTERS = {
     'Manual': ManualAdapter,

From babe4ece1f9a50d3a9d7e4f515b5789942c54790 Mon Sep 17 00:00:00 2001
From: Julian Hess
Date: Sat, 16 Dec 2023 14:06:12 +0000
Subject: [PATCH 19/19] Add timeouts to attaching/detaching rodisks

---
 canine/localization/base.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/canine/localization/base.py b/canine/localization/base.py
index 0959708..9ee87aa 100644
--- a/canine/localization/base.py
+++ b/canine/localization/base.py
@@ -1290,8 +1290,10 @@ def sensitive_ext_extract(basename):
  # we can run into a race condition here if other tasks are
  # attempting to mount the same disk simultaneously, so we
  # force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
- # which we explicitly propagate to cause localization to be retried
- "gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { [ $? == 5 ] && exit 5 || true; }",
+ # which we explicitly propagate to cause localization to be retried.
+ # mounting the disk can also hang (exit 124), in which case we
+ # also cause localization to be retried by returning exit 5
+ "timeout -k 30 30 gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { ec=$?; [[ $ec == 5 || $ec == 124 ]] && exit 5 || true; }",
  "fi",

  # mount the disk if it's not already
@@ -1432,7 +1434,7 @@ def sensitive_ext_extract(basename):
  ' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
  ' echo "Unmounting read-only disk ${CANINE_RODISK}" >&2',
  ' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
- ' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
+ ' timeout -k 30 30 gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
  ' else',
  ' echo "Read-only disk ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
  ' fi',
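
For reference, the timeout(1) behavior this last patch leans on: "timeout -k 30 30 CMD" sends SIGTERM after 30 seconds, SIGKILL 30 seconds later, and reports a timed-out command with exit status 124, which the localizer now treats the same way as a quota error (retry). A tiny bash sketch of that mapping, with sleep standing in for a hung gcloud call:

    #!/bin/bash
    # a command that hangs: timeout kills it after 2 seconds and returns 124
    timeout -k 30 2 sleep 60 || {
        ec=$?
        # treat quota errors (5) and hangs (124) as "retry localization"; ignore the rest
        [[ $ec == 5 || $ec == 124 ]] && exit 5 || true
    }
    echo "not reached when the command hangs"

The 2-second limit here is only to make the example quick to run; the patch itself uses 30 seconds.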