# UNet.yaml
base: &base
  # Training config
  weight_init: {conv_init: 'normal', conv_scale: 0.02, conv_bias: 0.}
  lambda_rho: 0 # weight for additional rho loss term
  full_scale: True # whether or not to use all 6 of the scales in U-Net
  global_batch_size: 64 # number of samples per training batch
  base_batch_size: 64 # single GPU batch size
  Nsamples: 65536
  Nsamples_val: 8192
  num_epochs: 10
  enable_amp: False
  enable_apex: False
  enable_benchy: False
  enable_jit: False
  expdir: '/logs'
  # params for setting learning rate (cosine decay schedule):
  #   start_lr: initial learning rate
  #   end_lr: final learning rate
  #   warmup_steps: number of steps over which to do linear warm-up of learning rate
  #                 *not used when training single-GPU or when scaling='none'*
  #   scaling: 'none'   initial lr doesn't change with respect to global batch size
  #            'linear' scale up according to lr_global_batchsize = (global_batchsize/base_batchsize)*lr_base_batchsize
  #            'sqrt'   scale up according to lr_global_batchsize = sqrt(global_batchsize/base_batchsize)*lr_base_batchsize
  lr_schedule: {scaling: 'sqrt', start_lr: 2.E-4, end_lr: 0., warmup_steps: 128}
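  # Worked example for the schedule above (illustrative arithmetic, values from
  # this file): at global_batch_size 256 with base_batch_size 64 and start_lr
  # 2e-4, 'sqrt' scaling warms up to 2e-4 * sqrt(256/64) = 4e-4, while
  # 'linear' would give 2e-4 * (256/64) = 8e-4.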
  # Data
  data_loader_config: 'lowmem' # choices: 'synthetic', 'inmem', 'lowmem', 'dali-lowmem'
  box_size: [1024, 512] # total size of simulation boxes (train, validation)
  data_size: 64 # size of crops for training
  num_data_workers: 2 # number of dataloader worker threads per proc
  N_out_channels: 5
  # HDF5 files for PyTorch native dataloader
  train_path: '/data/downsamp_2048crop_train.h5'
  val_path: '/data/downsamp_1024crop_valid.h5'
  # numpy files for DALI dataloader
  train_path_npy_data: '/data/downsamp_2048crop_train_data.npy'
  train_path_npy_label: '/data/downsamp_2048crop_train_label.npy'
  val_path_npy_data: '/data/downsamp_1024crop_valid_data.npy'
  val_path_npy_label: '/data/downsamp_1024crop_valid_label.npy'
  use_cache: None # set this to a cache dir (e.g., NVMe on CoriGPU) if you copied data there
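  # e.g., a commented-out sketch (hypothetical path, adjust to your system):
  # use_cache: '/tmp/nvme-cache'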
# A short config for testing/profiling on a single A100
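# Note: '<<: *base' below is a standard YAML merge key; each derived config
# inherits every key from the referenced anchor and overrides only the keys
# listed after it.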
short: &short
  <<: *base
  Nsamples: 4096
  Nsamples_val: 512
  num_epochs: 4
# Short config with full optimizations
short_opt:
  <<: *short
  data_loader_config: 'dali-lowmem'
  enable_amp: True
  enable_apex: True
  enable_jit: True
# Full training, batch size 64, with optimizations
bs64_opt: &bs64_opt
  <<: *base
  data_loader_config: 'dali-lowmem'
  global_batch_size: 64
  enable_amp: True
  enable_apex: True
  enable_jit: True
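# The GPU counts noted below assume each GPU keeps base_batch_size (64)
# samples, so num_gpus = global_batch_size / 64 (e.g., 256/64 = 4 GPUs).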
bs256_opt: # 4 GPUs
  <<: *bs64_opt
  global_batch_size: 256
  num_epochs: 20
bs512_test: # 8 GPUs
  <<: *bs64_opt
  global_batch_size: 512
bs512_opt: # 8 GPUs
  <<: *bs64_opt
  global_batch_size: 512
  num_epochs: 40
bs2048_opt: # 32 GPUs
  <<: *bs64_opt
  global_batch_size: 2048
  num_epochs: 160
# Warning: training may be unstable
bs8192_opt: # 128 GPUs
  <<: *bs64_opt
  global_batch_size: 8192
  num_epochs: 320
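# A minimal loading sketch (assumption: the training script selects a config
# by name; PyYAML resolves the &anchor/'<<' merges during parsing):
#   import yaml
#   with open('UNet.yaml') as f:
#       params = yaml.safe_load(f)['bs64_opt']
#   print(params['global_batch_size'], params['enable_amp'])  # 64 True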