Skip to content

Commit

Permalink
Support submitit Hydra launcher, to launch runs on SLURM clusters
Browse files Browse the repository at this point in the history
This launcher is meant to eventually replace the `vital.utils.jobs` package completely
  • Loading branch information
nathanpainchaud committed Jul 25, 2022
1 parent d0a5a2a commit c214ef4
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,11 @@ CAMUS_DATA_PATH="path/to/camus"
### API keys ###
COMET_API_KEY="<your-comet-api-key>"

### Alliance clusters config ###
ALLIANCE_VENV_PATH="path/to/project/virtualenv"

### SLURM config ###
SLURM_MAIL_USER="<mail-address-to-notify>"

### Error Flags ###
# HYDRA_FULL_ERROR=1
1 change: 1 addition & 0 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ natsort
nibabel
albumentations
hydra-core~=1.2.0
hydra-submitit-launcher
python-dotenv
19 changes: 19 additions & 0 deletions vital/config/hydra/launcher/alliance.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# @package _global_

defaults:
- submitit_slurm@_group_

hydra:
launcher:
timeout_min: ${oc.select:run_time_min,60}
setup:
- "module load httpproxy" # load module allowing to connect to whitelisted domains
- "source $ALLIANCE_VENV_PATH/bin/activate" # activate the pre-installed virtual environment
- "rsync -a ${data.dataset_path} $SLURM_TMPDIR" # copy the dataset to the compute node
additional_parameters:
mail-user: ${oc.env:SLURM_MAIL_USER,null}
mail-type: ALL

# NOTE: Options meant to override train/task/data should be defined in a "final" launcher config (e.g. `beluga.yaml`)
# and not in a launcher config meant to be used in the defaults list of other configs. Otherwise, the order in which
# the configs are composed might not give priority to the launcher config.
16 changes: 16 additions & 0 deletions vital/config/hydra/launcher/beluga.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# @package _global_

defaults:
- alliance

hydra:
launcher:
gpus_per_node: 1
cpus_per_gpu: 10
mem_per_gpu: "47750M"

trainer:
enable_progress_bar: False

data:
num_workers: 9
1 change: 1 addition & 0 deletions vital/config/vital_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ defaults:
- trainer: default
- task: default # Incomplete config, but sets some common options
- data: default # Incomplete config, but sets some common options
- hydra/launcher: default # List launcher after trainer/task/data, so that it can override their configs
- callbacks:
- model_checkpoint
- logger: comet/online
Expand Down
15 changes: 15 additions & 0 deletions vital/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,21 @@ def _check_cfg(cfg: DictConfig) -> DictConfig:
with open_dict(cfg):
cfg.trainer.default_root_dir = os.getcwd()

# When running on a SLURM cluster, check whether the dataset was copied to the compute node,
# and update the path if it was. Otherwise, use the path as-is.
if compute_node_dir := os.environ.get("SLURM_TMPDIR"):
dataset_name = Path(cfg.data.dataset_path).name
slurm_dataset_path = Path(compute_node_dir) / dataset_name
if slurm_dataset_path.exists():
cfg.data.dataset_path = str(slurm_dataset_path)
else:
logger.warning(
f"Running in a distributed computing environment, but we could not locate the dataset on the node "
f"the code is running on (e.g., no file named '{dataset_name}' in $SLURM_TMPDIR). "
f"You should consider copying the root of your dataset to $SLURM_TMPDIR in your job's setup so "
f"that we can detect it and use the local data for improved performance."
)

return cfg

@staticmethod
Expand Down

0 comments on commit c214ef4

Please sign in to comment.