
Switch from SGD optimizer to AdamW for patch classification tasks #690

Merged · 9 commits · Oct 18, 2024
6 changes: 2 additions & 4 deletions configs/core/tests/offline/embeddings.yaml
@@ -26,11 +26,9 @@ model:
         out_features: &NUM_CLASSES 2
     criterion: torch.nn.CrossEntropyLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: ${oc.env:LR_VALUE, 0.1}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
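The `${oc.env:LR_VALUE, 0.0003}` syntax is an OmegaConf environment-variable resolver: it reads `LR_VALUE` if set and otherwise falls back to 0.0003. As a rough sketch of what the new optimizer block amounts to once the config is instantiated (the head dimensions below are placeholders, not values from this file):

```python
import os

from torch import nn, optim

head = nn.Linear(768, 2)  # placeholder stand-in for the configured classification head

# ${oc.env:LR_VALUE, 0.0003} -> use the LR_VALUE env var if present, else 0.0003
lr = float(os.environ.get("LR_VALUE", "0.0003"))

# AdamW with PyTorch defaults (betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01),
# replacing the previous SGD(lr=..., momentum=0.9, weight_decay=0.0) setup.
optimizer = optim.AdamW(head.parameters(), lr=lr)
```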
6 changes: 2 additions & 4 deletions configs/vision/pathology/offline/classification/bach.yaml
@@ -53,11 +53,9 @@ model:
         out_features: &NUM_CLASSES 4
     criterion: torch.nn.CrossEntropyLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.000625}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
@@ -18,7 +18,7 @@ trainer:
           filename: best
           save_last: true
           save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryAccuracy}
+          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy}
           mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
       - class_path: lightning.pytorch.callbacks.EarlyStopping
         init_args:
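For context on the monitoring change: balanced accuracy is the mean of per-class recall, so on imbalanced binary patch datasets it is much harder to inflate by always predicting the majority class than plain accuracy. A minimal illustration (using scikit-learn here rather than eva's own metric class):

```python
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Toy imbalanced split: 8 negatives, 2 positives; the classifier misses one positive.
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

print(accuracy_score(y_true, y_pred))           # 0.9  - inflated by the majority class
print(balanced_accuracy_score(y_true, y_pred))  # 0.75 - mean of per-class recall (1.0, 0.5)
```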
@@ -18,7 +18,7 @@ trainer:
           filename: best
           save_last: true
           save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryAccuracy}
+          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy}
           mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
       - class_path: lightning.pytorch.callbacks.EarlyStopping
         init_args:
8 changes: 3 additions & 5 deletions configs/vision/pathology/offline/classification/crc.yaml
@@ -53,11 +53,9 @@ model:
         out_features: &NUM_CLASSES 9
     criterion: torch.nn.CrossEntropyLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.01}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
@@ -104,7 +102,7 @@ data:
         split: val
     dataloaders:
       train:
-        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 4096}
+        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
         num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
         shuffle: true
       val:
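A back-of-the-envelope view of what the smaller batch size means for schedule length (the dataset size here is purely illustrative, not the CRC split size):

```python
num_train_samples = 100_000  # illustrative only
batch_size = 256             # new default, previously 4,096
max_steps = 12_500           # patch-level step budget from docs/leaderboards.md

steps_per_epoch = num_train_samples // batch_size  # 390
epochs = max_steps / steps_per_epoch               # ~32 effective epochs
print(steps_per_epoch, round(epochs, 1))
```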
10 changes: 4 additions & 6 deletions configs/vision/pathology/offline/classification/mhist.yaml
@@ -18,12 +18,12 @@ trainer:
           filename: best
           save_last: true
           save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryAccuracy}
+          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy}
           mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
       - class_path: lightning.pytorch.callbacks.EarlyStopping
         init_args:
           min_delta: 0
-          patience: 51
+          patience: 70
           monitor: *MONITOR_METRIC
           mode: *MONITOR_METRIC_MODE
       - class_path: eva.callbacks.ClassificationEmbeddingsWriter
@@ -53,11 +53,9 @@ model:
         out_features: 1
     criterion: torch.nn.BCEWithLogitsLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.000625}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
@@ -18,7 +18,7 @@ trainer:
           filename: best
           save_last: true
           save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryAccuracy}
+          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy}
           mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
       - class_path: lightning.pytorch.callbacks.EarlyStopping
         init_args:
@@ -54,11 +54,9 @@ model:
         out_features: 1
     criterion: torch.nn.BCEWithLogitsLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.01}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
@@ -118,7 +116,7 @@ data:
         split: test
     dataloaders:
       train:
-        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 4096}
+        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
         num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
         shuffle: true
       val:
6 changes: 2 additions & 4 deletions configs/vision/pathology/online/classification/bach.yaml
@@ -45,11 +45,9 @@ model:
         out_features: &NUM_CLASSES 4
     criterion: torch.nn.CrossEntropyLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.000625}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
8 changes: 3 additions & 5 deletions configs/vision/pathology/online/classification/crc.yaml
@@ -45,11 +45,9 @@ model:
         out_features: &NUM_CLASSES 9
     criterion: torch.nn.CrossEntropyLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.01}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
@@ -86,7 +84,7 @@ data:
         split: val
     dataloaders:
       train:
-        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 4096}
+        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
         num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
         shuffle: true
       val:
10 changes: 4 additions & 6 deletions configs/vision/pathology/online/classification/mhist.yaml
@@ -17,12 +17,12 @@ trainer:
           filename: best
           save_last: true
           save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryAccuracy}
+          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy}
           mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
       - class_path: lightning.pytorch.callbacks.EarlyStopping
         init_args:
           min_delta: 0
-          patience: 51
+          patience: 70
           monitor: *MONITOR_METRIC
           mode: *MONITOR_METRIC_MODE
     logger:
@@ -45,11 +45,9 @@ model:
         out_features: 1
     criterion: torch.nn.BCEWithLogitsLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.000625}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
@@ -17,7 +17,7 @@ trainer:
           filename: best
           save_last: true
           save_top_k: 1
-          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryAccuracy}
+          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/BinaryBalancedAccuracy}
           mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
       - class_path: lightning.pytorch.callbacks.EarlyStopping
         init_args:
@@ -45,11 +45,9 @@ model:
         out_features: 1
     criterion: torch.nn.BCEWithLogitsLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.01}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
@@ -91,7 +89,7 @@ data:
         split: test
     dataloaders:
       train:
-        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 4096}
+        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
         num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
         shuffle: true
       val:
6 changes: 2 additions & 4 deletions configs/vision/tests/offline/panda.yaml
@@ -34,11 +34,9 @@ model:
         output_size: &NUM_CLASSES 6
     criterion: torch.nn.CrossEntropyLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: &LR_VALUE ${oc.env:LR_VALUE, 0.00004}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
6 changes: 2 additions & 4 deletions configs/vision/tests/offline/patch_camelyon.yaml
@@ -47,11 +47,9 @@ model:
         out_features: 1
     criterion: torch.nn.BCEWithLogitsLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: ${oc.env:LR_VALUE, 0.1}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
6 changes: 2 additions & 4 deletions configs/vision/tests/online/patch_camelyon.yaml
@@ -18,11 +18,9 @@ model:
         out_features: 1
     criterion: torch.nn.BCEWithLogitsLoss
     optimizer:
-      class_path: torch.optim.SGD
+      class_path: torch.optim.AdamW
       init_args:
-        lr: ${oc.env:LR_VALUE, 0.1}
-        momentum: 0.9
-        weight_decay: 0.0
+        lr: ${oc.env:LR_VALUE, 0.0003}
     lr_scheduler:
       class_path: torch.optim.lr_scheduler.CosineAnnealingLR
       init_args:
18 changes: 8 additions & 10 deletions docs/leaderboards.md
@@ -38,26 +38,24 @@ We selected this approach to prioritize reliable, robust and fair FM-evaluation
 | **Dropout**                     | 0.0                   | 0.0                    | 0.0                    |
 | **Hidden activation function**  | n/a                   | ReLU                   | n/a                    |
 | **Output activation function**  | none                  | none                   | none                   |
-| **Number of steps**             | 12,500                | 12,500 (2)             | 2,000                  |
-| **Base batch size**             | 4,096 (1)             | 32                     | 64                     |
-| **Base learning rate**          | 0.01 (1)              | 0.001                  | 0.0001                 |
-| **Early stopping**              | 5% * [Max epochs]     | 10% * [Max epochs] (3) | 10% * [Max epochs] (3) |
+| **Number of steps**             | 12,500                | 12,500 (1)             | 2,000                  |
+| **Base batch size**             | 256                   | 32                     | 64                     |
+| **Base learning rate**          | 0.0003                | 0.001                  | 0.0001                 |
+| **Early stopping**              | 5% * [Max epochs]     | 10% * [Max epochs] (2) | 10% * [Max epochs] (2) |
 | **Optimizer**                   | SGD                   | AdamW                  | AdamW                  |
 | **Momentum**                    | 0.9                   | n/a                    | n/a                    |
 | **Weight Decay**                | 0.0                   | n/a                    | n/a                    |
 | **betas**                       | n/a                   | [0.9, 0.999]           | [0.9, 0.999]           |
 | **LR Schedule**                 | Cosine without warmup | Cosine without warmup  | PolynomialLR           |
 | **Loss**                        | Cross entropy         | Cross entropy          | Dice                   |
-| **number of patches per slide** | 1                     | dataset specific (4)   | dataset specific (4)   |
+| **number of patches per slide** | 1                     | dataset specific (3)   | dataset specific (3)   |


-(1) For smaller datasets (e.g. BACH with 400 samples) we reduce the batch size to 256 and scale the learning rate accordingly.
+(1) Upper cap at a maximum of 100 epochs.

-(2) Upper cap at a maximum of 100 epochs.
+(2) Lower cap at a minimum of 8 epochs.

-(3) Lower cap at a minimum of 8 epochs.
-
-(4) Number of patches per slide depends on task and slide size. E.g. for PANDA and Camelyon16 we use a max of 1,000 and 10,000 random patches per slide respectively.
+(3) Number of patches per slide depends on task and slide size. E.g. for `PANDASmall` and `Camelyon16Small` we use a max of 200 and 1000 random patches per slide respectively.


 - [1]: [Virchow: A Million-Slide Digital Pathology Foundation Model, 2024](https://arxiv.org/pdf/2309.07778.pdf)
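To summarize the updated patch-level defaults from the table in plain PyTorch terms, here is a minimal sketch (the linear probe dimensions and the random dataset are placeholders; eva builds the real modules and dataloaders from the task configs):

```python
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

# Placeholder linear probe over frozen foundation-model embeddings.
probe = nn.Linear(768, 9)
criterion = nn.CrossEntropyLoss()

# New patch-level defaults: AdamW at lr=3e-4 (PyTorch default betas / weight decay),
# batch size 256, 12,500 steps, cosine schedule without warmup.
optimizer = optim.AdamW(probe.parameters(), lr=3e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=12_500)

# Placeholder data; real runs iterate over pre-computed patch embeddings.
loader = DataLoader(
    TensorDataset(torch.randn(1024, 768), torch.randint(0, 9, (1024,))),
    batch_size=256,
    shuffle=True,
)

step = 0
while step < 12_500:
    for embeddings, targets in loader:
        optimizer.zero_grad()
        loss = criterion(probe(embeddings), targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        step += 1
        if step >= 12_500:
            break
```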