From c214ef4eaca655031beec8c4919ca46f85d09aa3 Mon Sep 17 00:00:00 2001 From: Nathan Painchaud Date: Fri, 22 Jul 2022 17:30:03 +0200 Subject: [PATCH] Support `submitit` Hydra launcher, to launch runs on SLURM clusters The use of this launcher is meant in time to completely replace the `vital.utils.jobs` package --- .env.example | 6 ++++++ requirements/requirements.txt | 1 + vital/config/hydra/launcher/alliance.yaml | 19 +++++++++++++++++++ vital/config/hydra/launcher/beluga.yaml | 16 ++++++++++++++++ vital/config/vital_default.yaml | 1 + vital/runner.py | 15 +++++++++++++++ 6 files changed, 58 insertions(+) create mode 100644 vital/config/hydra/launcher/alliance.yaml create mode 100644 vital/config/hydra/launcher/beluga.yaml diff --git a/.env.example b/.env.example index 2eba6e135..ac8fbab98 100644 --- a/.env.example +++ b/.env.example @@ -10,5 +10,11 @@ CAMUS_DATA_PATH="path/to/camus" ### API keys ### COMET_API_KEY="" +### Alliance clusters config ### +ALLIANCE_VENV_PATH="path/to/project/virtualenv" + +### SLURM config ### +SLURM_MAIL_USER="" + ### Error Flags ### # HYDRA_FULL_ERROR=1 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index e507fa78e..f8e8905c7 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -23,4 +23,5 @@ natsort nibabel albumentations hydra-core~=1.2.0 +hydra-submitit-launcher python-dotenv diff --git a/vital/config/hydra/launcher/alliance.yaml b/vital/config/hydra/launcher/alliance.yaml new file mode 100644 index 000000000..19c083ccc --- /dev/null +++ b/vital/config/hydra/launcher/alliance.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - submitit_slurm@_group_ + +hydra: + launcher: + timeout_min: ${oc.select:run_time_min,60} + setup: + - "module load httpproxy" # load module allowing to connect to whitelisted domains + - "source $ALLIANCE_VENV_PATH/bin/activate" # activate the pre-installed virtual environment + - "rsync -a ${data.dataset_path} $SLURM_TMPDIR" # copy the dataset 
to the compute node + additional_parameters: + mail-user: ${oc.env:SLURM_MAIL_USER,null} + mail-type: ALL + +# NOTE: Options meant to override train/task/data should be defined in a "final" launcher config (e.g. `beluga.yaml`) +# and not in a launcher config meant to be used in the defaults list. Otherwise, the order of the composition of the +# configs might not give priority to the launcher config. diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml new file mode 100644 index 000000000..6f9a603e7 --- /dev/null +++ b/vital/config/hydra/launcher/beluga.yaml @@ -0,0 +1,16 @@ +# @package _global_ + +defaults: + - alliance + +hydra: + launcher: + gpus_per_node: 1 + cpus_per_gpu: 10 + mem_per_gpu: "47750M" + +trainer: + enable_progress_bar: False + +data: + num_workers: 9 diff --git a/vital/config/vital_default.yaml b/vital/config/vital_default.yaml index 68ef56529..806932691 100644 --- a/vital/config/vital_default.yaml +++ b/vital/config/vital_default.yaml @@ -2,6 +2,7 @@ defaults: - trainer: default - task: default # Incomplete config, but sets some common options - data: default # Incomplete config, but sets some common options + - hydra/launcher: default # List launcher after trainer/task/data, so that it can override their configs - callbacks: - model_checkpoint - logger: comet/online diff --git a/vital/runner.py b/vital/runner.py index 8bd1b7ba3..418da7a35 100644 --- a/vital/runner.py +++ b/vital/runner.py @@ -140,6 +140,21 @@ def _check_cfg(cfg: DictConfig) -> DictConfig: with open_dict(cfg): cfg.trainer.default_root_dir = os.getcwd() + # When running on a SLURM cluster, we will look to see if the dataset was copied on the compute node, + # and update the path if it was. Otherwise, we will use the path as-is.
+ if compute_node_dir := os.environ.get("SLURM_TMPDIR"): + dataset_name = Path(cfg.data.dataset_path).name + slurm_dataset_path = Path(compute_node_dir) / dataset_name + if slurm_dataset_path.exists(): + cfg.data.dataset_path = str(slurm_dataset_path) + else: + logger.warning( + f"Running in a distributed computing environment, but we could not locate the dataset on the node " + f"the code is running on (e.g., no file named '{dataset_name}' in $SLURM_TMPDIR). " + f"You should consider copying the root of your dataset to $SLURM_TMPDIR in your job's setup so " + f"that we can detect it and use the local data for improved performance." + ) + return cfg @staticmethod