Merge branch 'master' into preempt_retry_fix
julianhess authored Jan 11, 2024
2 parents edf6c9d + 21c97e1 commit 5eedbf5
Showing 5 changed files with 53 additions and 55 deletions.
12 changes: 11 additions & 1 deletion canine/backends/base.py
@@ -400,7 +400,17 @@ def sbatch(self, command: str, *slurmopts: str, **slurmparams: typing.Any) -> st
command
)
status, stdout, stderr = self.invoke(command)
check_call(command, status, stdout, stderr)

if status != 0:
# explicitly catch error if user requested impossibly high resources
stderr_str = stderr.read().decode()
stderr.seek(0)
if stderr is not None and "Requested node configuration is not available" in stderr_str:
canine_logging.error(f"Requested CPU/memory resources exceed maximum cluster capacity. Please request fewer resources, or add a suitable node to /mnt/nfs/clust_conf/slurm/nodetypes.json")
raise ResourceWarning
# all other errors are handled generically
check_call(command, status, stdout, stderr)

out = stdout.read().decode()
err = stderr.read().decode()
result = batch_job_pattern.search(out)
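The new branch in `sbatch` peeks at stderr for a known Slurm message and rewinds the stream before passing it on to the generic `check_call` handler. Below is a minimal sketch of that peek-and-rewind pattern, using an in-memory stream in place of Canine's SSH channel; the function name and sample message are illustrative only.

```python
import io

def raise_if_resources_unavailable(status: int, stderr: io.BytesIO) -> None:
    # peek at stderr for the known Slurm message, then rewind so a later,
    # generic handler (check_call in the hunk above) can read the same stream
    if status == 0:
        return
    stderr_str = stderr.read().decode()
    stderr.seek(0)
    if "Requested node configuration is not available" in stderr_str:
        raise ResourceWarning("requested CPU/memory exceeds cluster capacity")

# usage with an in-memory stream standing in for the SSH channel's stderr
err = io.BytesIO(b"sbatch: error: Requested node configuration is not available\n")
try:
    raise_if_resources_unavailable(1, err)
except ResourceWarning as exc:
    print("caught:", exc)
print(err.read().decode(), end="")  # still readable because we rewound it
```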
8 changes: 5 additions & 3 deletions canine/backends/dockerTransient.py
@@ -37,11 +37,13 @@ def __init__(
image_project = "broad-getzlab-workflows",
image = None,
storage_namespace = "workspace", storage_bucket = None, storage_disk = None, storage_disk_size = "100",
clust_frac = 1.0, user = os.environ["USER"], shutdown_on_exit = False, **kwargs
clust_frac = 1.0, user = None, shutdown_on_exit = False, **kwargs
):
if user is None:
# IE: USER was not set
raise ValueError("USER not set in environment. Must explicitly pass user argument")
if "USER" in os.environ:
user = os.environ["USER"]
else:
raise ValueError("$USER not set in environment. Must explicitly pass user argument")

if storage_bucket is not None and storage_disk is not None:
canine_logging.warning("You specified both a persistent disk and cloud bucket to store workflow outputs; will only store to bucket!")
71 changes: 26 additions & 45 deletions canine/localization/base.py
@@ -802,6 +802,8 @@ def create_persistent_disk(self,
'GCP_DISK_SIZE={}'.format(disk_size),
'GCP_TSNT_DISKS_DIR={}'.format(mount_prefix),

'echo "Saving outputs to scratch disk ${GCP_DISK_NAME}" >&2' if is_scratch_disk else 'echo "Localizing inputs to cache disk ${GCP_DISK_NAME} (${GCP_DISK_SIZE}GB)" >&2',

## create disk
'if ! gcloud compute disks describe "${GCP_DISK_NAME}" --zone ${CANINE_NODE_ZONE}; then',
'gcloud compute disks create "${GCP_DISK_NAME}" --size "${GCP_DISK_SIZE}GB" --type pd-standard --zone "${CANINE_NODE_ZONE}" --labels wolf=canine',
@@ -812,37 +814,16 @@

## attach as read-write, using same device-name as disk-name
'if [[ ! -e /dev/disk/by-id/google-${GCP_DISK_NAME} ]]; then',
'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || true',
'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || { [ $? == 5 ] && exit 5 || true; }',
'fi',

## wait for disk to attach, with exponential backoff up to 2 minutes
'DELAY=1',
'while [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; do',
## check if disk is being created by _another_ instance (grep -qv $CANINE_NODE_NAME)
'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -qv "$CANINE_NODE_NAME"\'$\'; then',
# if disk is a localization disk (i.e. not a scratch disk), wait approximately how long it would take to transfer files to it
'if ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "scratch=yes"; then',
'TRIES=0',
# wait until disk is marked "finished"
'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
'echo "Waiting for localization disk to become available ..." >&2',
'[ $TRIES == 0 ] && sleep {} || sleep 300'.format(int(disk_size/0.1)), # assume 100 MB/sec transfer for first timeout; 5 minutes thereafter up to 10 times
'[ $TRIES -ge 10 ] && { echo "Exceeded timeout waiting for disk to become available" >&2; exit 1; } || :',
'((++TRIES))',
'done',
'exit 15 #DEBUG_OMIT', # special exit code to cause the script to be skipped in entrypoint.sh
# if disk is a scratch disk, wait up to two hours for it to finish. once it's finished, fail the localizer, to cause task to be requeued and avoided.
'else',
'TRIES=0',
# wait until disk is marked "finished"
'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
'echo "Waiting for scratch disk to become available ..." >&2',
'sleep 60',
'[ $TRIES -ge 120 ] && { echo "Exceeded timeout waiting for another node to finish making scratch disk" >&2; exit 1; } || :',
'((++TRIES))',
'done',
'exit 1 #DEBUG_OMIT', # fail localizer -> requeue task -> job avoid
'fi',
'echo "ERROR: Disk being created by another instances. Will retry when finished." >&2',
'exit 5', # cause task to be requeued
'fi',
# TODO: what if the task exited on the other
# instance without running the teardown script
@@ -851,7 +832,7 @@
# are there any scenarios in which this would be a bad idea?

## if disk is not being created by another instance, it might just be taking a bit to attach. give it a chance to appear in /dev
'[ $DELAY -gt 128 ] && { echo "Exceeded timeout trying to attach disk" >&2; exit 1; } || :',
'[ $DELAY -gt 128 ] && { echo "ERROR: Exceeded timeout trying to attach disk" >&2; exit 5; } || :',
'sleep $DELAY; ((DELAY *= 2))',

# try attaching again if delay has exceeded 8 seconds
@@ -861,8 +842,8 @@
'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || :',
'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\' && [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; then',
'sudo touch /.fatal_disk_issue_sentinel',
'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
'exit 1',
'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry localization on another node." >&2',
'exit 5',
'fi',
'fi',
'done',
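The attach loop above polls `/dev/disk/by-id` with an exponentially growing delay and bails out (now with exit 5, so the task is requeued) once the delay exceeds 128 seconds. A rough Python equivalent of that backoff, with illustrative names and limits:

```python
import os
import time

def wait_for_device(dev_path: str, max_delay: int = 128) -> bool:
    # poll for the block device, doubling the sleep each round (1, 2, 4, ... s);
    # give up once the delay exceeds max_delay, roughly two minutes of waiting
    delay = 1
    while not os.path.exists(dev_path):
        if delay > max_delay:
            return False  # caller exits 5 so the task is requeued elsewhere
        time.sleep(delay)
        delay *= 2
    return True

# e.g. wait_for_device("/dev/disk/by-id/google-example-disk")
```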
@@ -1274,6 +1255,8 @@ def sensitive_ext_extract(basename):
"CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}",
"CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}",

'echo "INFO: Attaching read-only disk ${CANINE_RODISK} ..." >&2',

"if [[ ! -d ${CANINE_RODISK_DIR} ]]; then",
"sudo mkdir -p ${CANINE_RODISK_DIR}",
"fi",
@@ -1285,8 +1268,11 @@
"if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
# we can run into a race condition here if other tasks are
# attempting to mount the same disk simultaneously, so we
# force a 0 exit
"gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || true",
# force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
# which we explicitly propagate to cause localization to be retried.
# mounting the disk can also hang (exit 124), in which case we
# also cause localization to be retried by returning exit 5
"timeout -k 30 30 gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { ec=$?; [[ $ec == 5 || $ec == 124 ]] && exit 5 || true; }",
"fi",

# mount the disk if it's not already
@@ -1301,36 +1287,34 @@
# this means the node is likely bad
'if [ ! -b /dev/disk/by-id/google-${CANINE_RODISK} ] && gcloud compute disks describe ${CANINE_RODISK} --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\'; then',
'sudo touch /.fatal_disk_issue_sentinel',
'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
'exit 1',
'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry on another node." >&2',
'exit 5', # retry on another node
'fi',
# otherwise, it didn't attach for some other reason
'echo "Timeout exceeded for disk to attach; perhaps the stderr of \`gcloud compute instances attach disk\` might contain insight:" >&2; cat $DIAG_FILE >&2',
'echo "ERROR: Read-only disk could not be attached!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :',
'exit 1',
'fi',
"sleep 10; ((++tries))",
"done",

# mount within Slurm worker container
"sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
# # mount on host (so that task dockers can access it)
# "if [[ -f /.dockerenv ]]; then",
# "sudo nsenter -t 1 -m mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
# "fi",
"sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || true",
"fi",

# because we forced zero exits for the previous commands,
# we need to verify that the mount actually exists
"mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; cat $DIAG_FILE >&2; exit 1; }",
'mountpoint -q ${CANINE_RODISK_DIR} || { echo "ERROR: Read-only disk mount failed!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :; exit 1; }',

# also verify that the filesystem is OK
"timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk filesystem is bad!' >&2; cat $DIAG_FILE >&2; exit 1; }",
"timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'WARNING: Read-only disk did not properly mount on this node; retrying.' >&2; exit 5; }",

# lock the disk; will be unlocked during teardown script (or if script crashes)
# this is to ensure that we don't unmount the disk during teardown
# if other processes are still using it
"flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

'echo "INFO: Successfully attached read-only disk ${CANINE_RODISK}." >&2',

"done",
]
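The `flock -os ... sleep infinity` line keeps a shared lock on the mount directory for the life of the job; the teardown script further down only detaches the disk if it can take the lock non-blocking. A hedged sketch of that handshake in Python (Linux-only; the directory here is a temporary stand-in for the RODISK mount point):

```python
import fcntl
import os
import tempfile

mount_dir = tempfile.mkdtemp()  # stand-in for ${CANINE_RODISK_DIR}

# job side: hold a shared lock for as long as the disk is in use
reader_fd = os.open(mount_dir, os.O_RDONLY)
fcntl.flock(reader_fd, fcntl.LOCK_SH)

# teardown side: probe with a non-blocking exclusive lock; failure means
# another job still holds a shared lock, so the disk stays mounted
probe_fd = os.open(mount_dir, os.O_RDONLY)
try:
    fcntl.flock(probe_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    print("disk idle; safe to unmount and detach")
except BlockingIOError:
    print("disk busy; leaving it mounted during teardown")
```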

@@ -1427,14 +1411,11 @@ def sensitive_ext_extract(basename):
' CANINE_RODISK=${!CANINE_RODISK}',
' CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}',
' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
' echo "Unmounting read-only disk ${CANINE_RODISK}" >&2',
' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
# # unmount on container host too
# ' if [[ -f /.dockerenv ]]; then',
# ' sudo nsenter -t 1 -m umount ${CANINE_RODISK_DIR} || true',
# ' fi',
' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK || echo "Error detaching disk ${CANINE_RODISK}" >&2',
' timeout -k 30 30 gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
' else',
' echo "RODISK ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
' echo "Read-only disk ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
' fi',
'done)'
] + ( disk_teardown_script if self.localize_to_persistent_disk else [] )
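With this change the localizer scripts signal their fate through exit codes: 5 means a recoverable failure (requeue, possibly on another node) and 15 means the work is already done and the job can be skipped. A simplified sketch of how a wrapper might dispatch on those codes; this is not the actual entrypoint.sh logic, just an illustration:

```python
import subprocess

def dispatch_localizer(script: str) -> str:
    # run the localizer and translate its exit code into an action
    rc = subprocess.call(["bash", script])
    if rc == 0:
        return "run job"
    if rc == 5:
        return "requeue job"  # recoverable: quota, bad node, attach timeout
    if rc == 15:
        return "skip job"     # everything already localized; do not rerun
    return "fail job"         # any other nonzero code is a hard failure
```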
11 changes: 8 additions & 3 deletions canine/orchestrator.py
@@ -14,7 +14,7 @@
import numpy as np
import pandas as pd
from agutil import status_bar
version = '0.15.1'
version = '0.15.4'

ADAPTERS = {
'Manual': ManualAdapter,
@@ -85,6 +85,7 @@
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
echo -e "!!!! JOB FAILED! (EXIT CODE !!!!\e[29G$CANINE_JOB_RC)" >&2
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
echo $(($([ -f $CANINE_JOB_ROOT/.job_failure_count ] && cat $CANINE_JOB_ROOT/.job_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.job_failure_count
echo '++++ STARTING JOB CLEANUP ++++' >&2
$CANINE_JOBS/$SLURM_ARRAY_TASK_ID/teardown.sh >&2
TEARDOWN_RC=$?
@@ -97,12 +98,16 @@
echo "INFO: Retrying job (attempt $((${{{{SLURM_RESTART_COUNT:-0}}}}+1))/$CANINE_RETRY_LIMIT)" >&2
[ -f $CANINE_JOB_ROOT/stdout ] && mv $CANINE_JOB_ROOT/stdout $CANINE_JOB_ROOT/stdout_${{{{SLURM_RESTART_COUNT:-0}}}} || :
[ -f $CANINE_JOB_ROOT/stderr ] && mv $CANINE_JOB_ROOT/stderr $CANINE_JOB_ROOT/stderr_${{{{SLURM_RESTART_COUNT:-0}}}} || :
echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
scontrol requeue $SLURM_JOB_ID
fi
done
echo -n $CANINE_JOB_RC > $CANINE_JOB_ROOT/.job_exit_code
elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # this is a special exit code that localization.sh can explicitly return
# these are special exit codes that localization.sh can explicitly return
elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
echo "WARNING: localization will be retried" >&2
echo $(($([ -f $CANINE_JOB_ROOT/.localization_failure_count ] && cat $CANINE_JOB_ROOT/.localization_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.localization_failure_count
scontrol requeue $SLURM_JOB_ID
elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
export CANINE_JOB_RC=0
echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code
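The three `.*_failure_count` lines added above share one shell idiom: read the previous count if the file exists (otherwise 0), add one, and write it back so the tally survives requeues. The same bookkeeping expressed in Python, for reference:

```python
from pathlib import Path

def bump_counter(path: Path) -> int:
    # read the previous count (0 if the file is missing), increment, persist
    count = int(path.read_text()) if path.exists() else 0
    count += 1
    path.write_text(str(count))
    return count

# e.g. bump_counter(Path(".job_failure_count"))  # path is illustrative
```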
6 changes: 3 additions & 3 deletions setup.py
@@ -57,9 +57,9 @@
'tables>=3.6.1',
'google-crc32c>=1.5.0',
'google-cloud-compute>=1.6.1',
#'slurm_gcp_docker>=0.12',
'slurm_gcp_docker @ git+https://github.com/getzlab/[email protected]',
],
] + (['slurm_gcp_docker @ git+https://github.com/getzlab/slurm_gcp_docker@v0.16.0']
if not os.path.exists("/.dockerenv") else []
), # avoid circular dependency of slurm_gcp_docker -> wolf -> canine,
python_requires = ">3.7",
classifiers = [
"Development Status :: 4 - Beta",
