Merge branch 'master' into preempt_retry_fix
julianhess authored Jan 11, 2024
2 parents edf6c9d + 21c97e1 commit 5eedbf5
Showing 5 changed files with 53 additions and 55 deletions.
12 changes: 11 additions & 1 deletion canine/backends/base.py
@@ -400,7 +400,17 @@ def sbatch(self, command: str, *slurmopts: str, **slurmparams: typing.Any) -> st
command
)
status, stdout, stderr = self.invoke(command)
check_call(command, status, stdout, stderr)

if status != 0:
# explicitly catch error if user requested impossibly high resources
stderr_str = stderr.read().decode()
stderr.seek(0)
if stderr is not None and "Requested node configuration is not available" in stderr_str:
canine_logging.error(f"Requested CPU/memory resources exceed maximum cluster capacity. Please request fewer resources, or add a suitable node to /mnt/nfs/clust_conf/slurm/nodetypes.json")
raise ResourceWarning
# all other errors are handled generically
check_call(command, status, stdout, stderr)

out = stdout.read().decode()
err = stderr.read().decode()
result = batch_job_pattern.search(out)
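The new branch in `sbatch` peeks at stderr for a known Slurm message and rewinds the stream before passing it on to the generic `check_call` handler. Below is a minimal sketch of that peek-and-rewind pattern, using an in-memory stream in place of Canine's SSH channel; the function name and sample message are illustrative only.

```python
import io

def raise_if_resources_unavailable(status: int, stderr: io.BytesIO) -> None:
    # peek at stderr for the known Slurm message, then rewind so a later,
    # generic handler (check_call in the hunk above) can read the same stream
    if status == 0:
        return
    stderr_str = stderr.read().decode()
    stderr.seek(0)
    if "Requested node configuration is not available" in stderr_str:
        raise ResourceWarning("requested CPU/memory exceeds cluster capacity")

# usage with an in-memory stream standing in for the SSH channel's stderr
err = io.BytesIO(b"sbatch: error: Requested node configuration is not available\n")
try:
    raise_if_resources_unavailable(1, err)
except ResourceWarning as exc:
    print("caught:", exc)
print(err.read().decode(), end="")  # still readable because we rewound it
```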
8 changes: 5 additions & 3 deletions canine/backends/dockerTransient.py
@@ -37,11 +37,13 @@ def __init__(
image_project = "broad-getzlab-workflows",
image = None,
storage_namespace = "workspace", storage_bucket = None, storage_disk = None, storage_disk_size = "100",
clust_frac = 1.0, user = os.environ["USER"], shutdown_on_exit = False, **kwargs
clust_frac = 1.0, user = None, shutdown_on_exit = False, **kwargs
):
if user is None:
# IE: USER was not set
raise ValueError("USER not set in environment. Must explicitly pass user argument")
if "USER" in os.environ:
user = os.environ["USER"]
else:
raise ValueError("$USER not set in environment. Must explicitly pass user argument")

if storage_bucket is not None and storage_disk is not None:
canine_logging.warning("You specified both a persistent disk and cloud bucket to store workflow outputs; will only store to bucket!")
71 changes: 26 additions & 45 deletions canine/localization/base.py
@@ -802,6 +802,8 @@ def create_persistent_disk(self,
'GCP_DISK_SIZE={}'.format(disk_size),
'GCP_TSNT_DISKS_DIR={}'.format(mount_prefix),

'echo "Saving outputs to scratch disk ${GCP_DISK_NAME}" >&2' if is_scratch_disk else 'echo "Localizing inputs to cache disk ${GCP_DISK_NAME} (${GCP_DISK_SIZE}GB)" >&2',

## create disk
'if ! gcloud compute disks describe "${GCP_DISK_NAME}" --zone ${CANINE_NODE_ZONE}; then',
'gcloud compute disks create "${GCP_DISK_NAME}" --size "${GCP_DISK_SIZE}GB" --type pd-standard --zone "${CANINE_NODE_ZONE}" --labels wolf=canine',
@@ -812,37 +814,16 @@

## attach as read-write, using same device-name as disk-name
'if [[ ! -e /dev/disk/by-id/google-${GCP_DISK_NAME} ]]; then',
'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || true',
'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || { [ $? == 5 ] && exit 5 || true; }',
'fi',

## wait for disk to attach, with exponential backoff up to 2 minutes
'DELAY=1',
'while [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; do',
## check if disk is being created by _another_ instance (grep -qv $CANINE_NODE_NAME)
'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -qv "$CANINE_NODE_NAME"\'$\'; then',
# if disk is a localization disk (i.e. not a scratch disk), wait approximately how long it would take to transfer files to it
'if ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "scratch=yes"; then',
'TRIES=0',
# wait until disk is marked "finished"
'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
'echo "Waiting for localization disk to become available ..." >&2',
'[ $TRIES == 0 ] && sleep {} || sleep 300'.format(int(disk_size/0.1)), # assume 100 MB/sec transfer for first timeout; 5 minutes thereafter up to 10 times
'[ $TRIES -ge 10 ] && { echo "Exceeded timeout waiting for disk to become available" >&2; exit 1; } || :',
'((++TRIES))',
'done',
'exit 15 #DEBUG_OMIT', # special exit code to cause the script to be skipped in entrypoint.sh
# if disk is a scratch disk, wait up to two hours for it to finish. once it's finished, fail the localizer, to cause task to be requeued and avoided.
'else',
'TRIES=0',
# wait until disk is marked "finished"
'while ! gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(labels)" | grep -q "finished=yes"; do',
'echo "Waiting for scratch disk to become available ..." >&2',
'sleep 60',
'[ $TRIES -ge 120 ] && { echo "Exceeded timeout waiting for another node to finish making scratch disk" >&2; exit 1; } || :',
'((++TRIES))',
'done',
'exit 1 #DEBUG_OMIT', # fail localizer -> requeue task -> job avoid
'fi',
'echo "ERROR: Disk being created by another instances. Will retry when finished." >&2',
'exit 5', # cause task to be requeued
'fi',
# TODO: what if the task exited on the other
# instance without running the teardown script
@@ -851,7 +832,7 @@
# are there any scenarios in which this would be a bad idea?

## if disk is not being created by another instance, it might just be taking a bit to attach. give it a chance to appear in /dev
'[ $DELAY -gt 128 ] && { echo "Exceeded timeout trying to attach disk" >&2; exit 1; } || :',
'[ $DELAY -gt 128 ] && { echo "ERROR: Exceeded timeout trying to attach disk" >&2; exit 5; } || :',
'sleep $DELAY; ((DELAY *= 2))',

# try attaching again if delay has exceeded 8 seconds
@@ -861,8 +842,8 @@
'gcloud compute instances attach-disk "$CANINE_NODE_NAME" --zone "$CANINE_NODE_ZONE" --disk "$GCP_DISK_NAME" --device-name "$GCP_DISK_NAME" || :',
'if gcloud compute disks describe $GCP_DISK_NAME --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\' && [ ! -b /dev/disk/by-id/google-${GCP_DISK_NAME} ]; then',
'sudo touch /.fatal_disk_issue_sentinel',
'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
'exit 1',
'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry localization on another node." >&2',
'exit 5',
'fi',
'fi',
'done',
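The attach loop above polls `/dev/disk/by-id` with an exponentially growing delay and bails out (now with exit 5, so the task is requeued) once the delay exceeds 128 seconds. A rough Python equivalent of that backoff, with illustrative names and limits:

```python
import os
import time

def wait_for_device(dev_path: str, max_delay: int = 128) -> bool:
    # poll for the block device, doubling the sleep each round (1, 2, 4, ... s);
    # give up once the delay exceeds max_delay, roughly two minutes of waiting
    delay = 1
    while not os.path.exists(dev_path):
        if delay > max_delay:
            return False  # caller exits 5 so the task is requeued elsewhere
        time.sleep(delay)
        delay *= 2
    return True

# e.g. wait_for_device("/dev/disk/by-id/google-example-disk")
```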
@@ -1274,6 +1255,8 @@ def sensitive_ext_extract(basename):
"CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}",
"CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}",

'echo "INFO: Attaching read-only disk ${CANINE_RODISK} ..." >&2',

"if [[ ! -d ${CANINE_RODISK_DIR} ]]; then",
"sudo mkdir -p ${CANINE_RODISK_DIR}",
"fi",
@@ -1285,8 +1268,11 @@
"if [[ ! -e /dev/disk/by-id/google-${CANINE_RODISK} ]]; then",
# we can run into a race condition here if other tasks are
# attempting to mount the same disk simultaneously, so we
# force a 0 exit
"gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || true",
# force a 0 exit, unless gcloud returned exit code 5 (quota exceeded),
# which we explicitly propagate to cause localization to be retried.
# mounting the disk can also hang (exit 124), in which case we
# also cause localization to be retried by returning exit 5
"timeout -k 30 30 gcloud compute instances attach-disk ${CANINE_NODE_NAME} --zone ${CANINE_NODE_ZONE} --disk ${CANINE_RODISK} --device-name ${CANINE_RODISK} --mode ro &>> $DIAG_FILE || { ec=$?; [[ $ec == 5 || $ec == 124 ]] && exit 5 || true; }",
"fi",

# mount the disk if it's not already
@@ -1301,36 +1287,34 @@
# this means the node is likely bad
'if [ ! -b /dev/disk/by-id/google-${CANINE_RODISK} ] && gcloud compute disks describe ${CANINE_RODISK} --zone $CANINE_NODE_ZONE --format "csv(users)[no-heading]" | grep \'^http\' | grep -q $CANINE_NODE_NAME\'$\'; then',
'sudo touch /.fatal_disk_issue_sentinel',
'echo "Node cannot attach disk; node is likely bad. Tagging for deletion." >&2',
'exit 1',
'echo "ERROR: Node cannot attach disk; node is likely bad. Tagging for deletion. Will retry on another node." >&2',
'exit 5', # retry on another node
'fi',
# otherwise, it didn't attach for some other reason
'echo "Timeout exceeded for disk to attach; perhaps the stderr of \`gcloud compute instances attach disk\` might contain insight:" >&2; cat $DIAG_FILE >&2',
'echo "ERROR: Read-only disk could not be attached!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :',
'exit 1',
'fi',
"sleep 10; ((++tries))",
"done",

# mount within Slurm worker container
"sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
# # mount on host (so that task dockers can access it)
# "if [[ -f /.dockerenv ]]; then",
# "sudo nsenter -t 1 -m mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} &>> $DIAG_FILE || true",
# "fi",
"sudo timeout -k 30 30 mount -o noload,ro,defaults /dev/disk/by-id/google-${CANINE_RODISK} ${CANINE_RODISK_DIR} || true",
"fi",

# because we forced zero exits for the previous commands,
# we need to verify that the mount actually exists
"mountpoint -q ${CANINE_RODISK_DIR} || { echo 'Read-only disk mount failed!' >&2; cat $DIAG_FILE >&2; exit 1; }",
'mountpoint -q ${CANINE_RODISK_DIR} || { echo "ERROR: Read-only disk mount failed!" >&2; [ -s $DIAG_FILE ] && { echo "The following error message may contain insight:" >&2; cat $DIAG_FILE >&2; } || :; exit 1; }',

# also verify that the filesystem is OK
"timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'Read-only disk filesystem is bad!' >&2; cat $DIAG_FILE >&2; exit 1; }",
"timeout -k 30 30 ls ${CANINE_RODISK_DIR} > /dev/null || { echo 'WARNING: Read-only disk did not properly mount on this node; retrying.' >&2; exit 5; }",

# lock the disk; will be unlocked during teardown script (or if script crashes)
# this is to ensure that we don't unmount the disk during teardown
# if other processes are still using it
"flock -os ${CANINE_RODISK_DIR} sleep infinity & echo $! >> ${CANINE_JOB_INPUTS}/.rodisk_lock_pids",

'echo "INFO: Successfully attached read-only disk ${CANINE_RODISK}." >&2',

"done",
]
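The `flock -os ... sleep infinity` line keeps a shared lock on the mount directory for the life of the job; the teardown script further down only detaches the disk if it can take the lock non-blocking. A hedged sketch of that handshake in Python (Linux-only; the directory here is a temporary stand-in for the RODISK mount point):

```python
import fcntl
import os
import tempfile

mount_dir = tempfile.mkdtemp()  # stand-in for ${CANINE_RODISK_DIR}

# job side: hold a shared lock for as long as the disk is in use
reader_fd = os.open(mount_dir, os.O_RDONLY)
fcntl.flock(reader_fd, fcntl.LOCK_SH)

# teardown side: probe with a non-blocking exclusive lock; failure means
# another job still holds a shared lock, so the disk stays mounted
probe_fd = os.open(mount_dir, os.O_RDONLY)
try:
    fcntl.flock(probe_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    print("disk idle; safe to unmount and detach")
except BlockingIOError:
    print("disk busy; leaving it mounted during teardown")
```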

@@ -1427,14 +1411,11 @@ def sensitive_ext_extract(basename):
' CANINE_RODISK=${!CANINE_RODISK}',
' CANINE_RODISK_DIR=CANINE_RODISK_DIR_${i}',
' CANINE_RODISK_DIR=${!CANINE_RODISK_DIR}',
' echo "Unmounting read-only disk ${CANINE_RODISK}" >&2',
' if flock -n ${CANINE_RODISK_DIR} true && mountpoint -q ${CANINE_RODISK_DIR} && sudo umount ${CANINE_RODISK_DIR}; then',
# # unmount on container host too
# ' if [[ -f /.dockerenv ]]; then',
# ' sudo nsenter -t 1 -m umount ${CANINE_RODISK_DIR} || true',
# ' fi',
' gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK || echo "Error detaching disk ${CANINE_RODISK}" >&2',
' timeout -k 30 30 gcloud compute instances detach-disk $CANINE_NODE_NAME --zone $CANINE_NODE_ZONE --disk $CANINE_RODISK && echo "Unmounted ${CANINE_RODISK}" >&2 || echo "Error detaching disk ${CANINE_RODISK}" >&2',
' else',
' echo "RODISK ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
' echo "Read-only disk ${CANINE_RODISK} is busy and will not be unmounted during teardown. It is likely in use by another job." >&2',
' fi',
'done)'
] + ( disk_teardown_script if self.localize_to_persistent_disk else [] )
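With this change the localizer scripts signal their fate through exit codes: 5 means a recoverable failure (requeue, possibly on another node) and 15 means the work is already done and the job can be skipped. A simplified sketch of how a wrapper might dispatch on those codes; this is not the actual entrypoint.sh logic, just an illustration:

```python
import subprocess

def dispatch_localizer(script: str) -> str:
    # run the localizer and translate its exit code into an action
    rc = subprocess.call(["bash", script])
    if rc == 0:
        return "run job"
    if rc == 5:
        return "requeue job"  # recoverable: quota, bad node, attach timeout
    if rc == 15:
        return "skip job"     # everything already localized; do not rerun
    return "fail job"         # any other nonzero code is a hard failure
```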
11 changes: 8 additions & 3 deletions canine/orchestrator.py
@@ -14,7 +14,7 @@
import numpy as np
import pandas as pd
from agutil import status_bar
version = '0.15.1'
version = '0.15.4'

ADAPTERS = {
'Manual': ManualAdapter,
@@ -85,6 +85,7 @@
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
echo -e "!!!! JOB FAILED! (EXIT CODE !!!!\e[29G$CANINE_JOB_RC)" >&2
echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" >&2
echo $(($([ -f $CANINE_JOB_ROOT/.job_failure_count ] && cat $CANINE_JOB_ROOT/.job_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.job_failure_count
echo '++++ STARTING JOB CLEANUP ++++' >&2
$CANINE_JOBS/$SLURM_ARRAY_TASK_ID/teardown.sh >&2
TEARDOWN_RC=$?
@@ -97,12 +98,16 @@
echo "INFO: Retrying job (attempt $((${{{{SLURM_RESTART_COUNT:-0}}}}+1))/$CANINE_RETRY_LIMIT)" >&2
[ -f $CANINE_JOB_ROOT/stdout ] && mv $CANINE_JOB_ROOT/stdout $CANINE_JOB_ROOT/stdout_${{{{SLURM_RESTART_COUNT:-0}}}} || :
[ -f $CANINE_JOB_ROOT/stderr ] && mv $CANINE_JOB_ROOT/stderr $CANINE_JOB_ROOT/stderr_${{{{SLURM_RESTART_COUNT:-0}}}} || :
echo $(($([ -f $CANINE_JOB_ROOT/.failure_count ] && cat $CANINE_JOB_ROOT/.failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.failure_count
scontrol requeue $SLURM_JOB_ID
fi
done
echo -n $CANINE_JOB_RC > $CANINE_JOB_ROOT/.job_exit_code
elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # this is a special exit code that localization.sh can explicitly return
# these are special exit codes that localization.sh can explicitly return
elif [ $LOCALIZER_JOB_RC -eq 5 ]; then # localization failed due to recoverable reason (e.g. quota); requeue the job
echo "WARNING: localization will be retried" >&2
echo $(($([ -f $CANINE_JOB_ROOT/.localization_failure_count ] && cat $CANINE_JOB_ROOT/.localization_failure_count || echo -n 0)+1)) > $CANINE_JOB_ROOT/.localization_failure_count
scontrol requeue $SLURM_JOB_ID
elif [ $LOCALIZER_JOB_RC -eq 15 ]; then # localization and job can be skipped (to facilitate avoidance of scratch disk tasks)
echo '~~~~ LOCALIZATION SKIPPED ~~~~' >&2
export CANINE_JOB_RC=0
echo -n "DNR" > $CANINE_JOB_ROOT/.job_exit_code
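The three `.*_failure_count` lines added above share one shell idiom: read the previous count if the file exists (otherwise 0), add one, and write it back so the tally survives requeues. The same bookkeeping expressed in Python, for reference:

```python
from pathlib import Path

def bump_counter(path: Path) -> int:
    # read the previous count (0 if the file is missing), increment, persist
    count = int(path.read_text()) if path.exists() else 0
    count += 1
    path.write_text(str(count))
    return count

# e.g. bump_counter(Path(".job_failure_count"))  # path is illustrative
```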
6 changes: 3 additions & 3 deletions setup.py
@@ -57,9 +57,9 @@
'tables>=3.6.1',
'google-crc32c>=1.5.0',
'google-cloud-compute>=1.6.1',
#'slurm_gcp_docker>=0.12',
'slurm_gcp_docker @ git+https://github.com/getzlab/[email protected]',
],
] + (['slurm_gcp_docker @ git+https://github.com/getzlab/slurm_gcp_docker@v0.16.0']
if not os.path.exists("/.dockerenv") else []
), # avoid circular dependency of slurm_gcp_docker -> wolf -> canine,
python_requires = ">3.7",
classifiers = [
"Development Status :: 4 - Beta",
