From c214ef4eaca655031beec8c4919ca46f85d09aa3 Mon Sep 17 00:00:00 2001
From: Nathan Painchaud <nathan.painchaud@usherbrooke.ca>
Date: Fri, 22 Jul 2022 17:30:03 +0200
Subject: [PATCH] Support `submitit` Hydra launcher, to launch runs on SLURM
 clusters

This launcher is meant to eventually replace the `vital.utils.jobs` package entirely
---
 .env.example                              |  6 ++++++
 requirements/requirements.txt             |  1 +
 vital/config/hydra/launcher/alliance.yaml | 19 +++++++++++++++++++
 vital/config/hydra/launcher/beluga.yaml   | 16 ++++++++++++++++
 vital/config/vital_default.yaml           |  1 +
 vital/runner.py                           | 15 +++++++++++++++
 6 files changed, 58 insertions(+)
 create mode 100644 vital/config/hydra/launcher/alliance.yaml
 create mode 100644 vital/config/hydra/launcher/beluga.yaml

diff --git a/.env.example b/.env.example
index 2eba6e135..ac8fbab98 100644
--- a/.env.example
+++ b/.env.example
@@ -10,5 +10,11 @@ CAMUS_DATA_PATH="path/to/camus"
 ### API keys ###
 COMET_API_KEY="<your-comet-api-key>"
 
+### Alliance clusters config ###
+ALLIANCE_VENV_PATH="path/to/project/virtualenv"
+
+### SLURM config ###
+SLURM_MAIL_USER="<mail-address-to-notify>"
+
 ### Error Flags ###
 # HYDRA_FULL_ERROR=1
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index e507fa78e..f8e8905c7 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -23,4 +23,5 @@ natsort
 nibabel
 albumentations
 hydra-core~=1.2.0
+hydra-submitit-launcher
 python-dotenv
diff --git a/vital/config/hydra/launcher/alliance.yaml b/vital/config/hydra/launcher/alliance.yaml
new file mode 100644
index 000000000..19c083ccc
--- /dev/null
+++ b/vital/config/hydra/launcher/alliance.yaml
@@ -0,0 +1,19 @@
+# @package _global_
+
+defaults:
+  - submitit_slurm@_group_
+
+hydra:
+  launcher:
+    timeout_min: ${oc.select:run_time_min,60}
+    setup:
+      - "module load httpproxy"  # load the module that allows connecting to whitelisted domains
+      - "source $ALLIANCE_VENV_PATH/bin/activate" # activate the pre-installed virtual environment
+      - "rsync -a ${data.dataset_path} $SLURM_TMPDIR" # copy the dataset to the compute node
+    additional_parameters:
+      mail-user: ${oc.env:SLURM_MAIL_USER,null}
+      mail-type: ALL
+
+# NOTE: Options meant to override trainer/task/data should be defined in a "final" launcher config (e.g. `beluga.yaml`)
+# and not in a launcher config meant to be used in a defaults list. Otherwise, the order in which the configs are
+# composed might not give priority to the launcher config.
diff --git a/vital/config/hydra/launcher/beluga.yaml b/vital/config/hydra/launcher/beluga.yaml
new file mode 100644
index 000000000..6f9a603e7
--- /dev/null
+++ b/vital/config/hydra/launcher/beluga.yaml
@@ -0,0 +1,16 @@
+# @package _global_
+
+defaults:
+  - alliance
+
+hydra:
+  launcher:
+    gpus_per_node: 1
+    cpus_per_gpu: 10
+    mem_per_gpu: "47750M"
+
+trainer:
+  enable_progress_bar: False
+
+data:
+  num_workers: 9
diff --git a/vital/config/vital_default.yaml b/vital/config/vital_default.yaml
index 68ef56529..806932691 100644
--- a/vital/config/vital_default.yaml
+++ b/vital/config/vital_default.yaml
@@ -2,6 +2,7 @@ defaults:
   - trainer: default
   - task: default  # Incomplete config, but sets some common options
   - data: default  # Incomplete config, but sets some common options
+  - hydra/launcher: default # List launcher after trainer/task/data, so that it can override their configs
   - callbacks:
       - model_checkpoint
   - logger: comet/online
diff --git a/vital/runner.py b/vital/runner.py
index 8bd1b7ba3..418da7a35 100644
--- a/vital/runner.py
+++ b/vital/runner.py
@@ -140,6 +140,21 @@ def _check_cfg(cfg: DictConfig) -> DictConfig:
             with open_dict(cfg):
                 cfg.trainer.default_root_dir = os.getcwd()
 
+        # When running on a SLURM cluster, we will look to see if the dataset was copied on the compute node,
+        # and update the path if it was. Otherwise, we will use the path as-is.
+        if compute_node_dir := os.environ.get("SLURM_TMPDIR"):
+            dataset_name = Path(cfg.data.dataset_path).name
+            slurm_dataset_path = Path(compute_node_dir) / dataset_name
+            if slurm_dataset_path.exists():
+                cfg.data.dataset_path = str(slurm_dataset_path)
+            else:
+                logger.warning(
+                    f"Running in a distributed computing environment, but we could not locate the dataset on the node "
+                    f"the code is running on (e.g., no file named '{dataset_name}' in $SLURM_TMPDIR). "
+                    f"You should consider copying the root of your dataset to $SLURM_TMPDIR in your job's setup so "
+                    f"that we can detect it and use the local data for improved performance."
+                )
+
         return cfg
 
     @staticmethod