From d75d694a83389060d5e2697bb9fa6f827f14c251 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 14:01:55 -0400 Subject: [PATCH 001/307] begin narval changes --- only_for_me/narval/narval.md | 51 +++++++++++++++++++++++++++++ only_for_me/narval/pytorch_test.py | 23 +++++++++++++ only_for_me/narval/requirements.txt | 28 ++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 only_for_me/narval/narval.md create mode 100644 only_for_me/narval/pytorch_test.py create mode 100644 only_for_me/narval/requirements.txt diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md new file mode 100644 index 00000000..b7b0f729 --- /dev/null +++ b/only_for_me/narval/narval.md @@ -0,0 +1,51 @@ + +Sources + +https://docs.alliancecan.ca/wiki/Narval/en +https://docs.alliancecan.ca/wiki/AI_and_Machine_Learning +https://docs.alliancecan.ca/wiki/PyTorch +https://prashp.gitlab.io/post/compute-canada-tut/ + +ssh walml@narval.alliancecan.ca + + module purge + module avail + +Just for venv: + module load python/3.9.6 + + mkdir ~/envs + virtualenv --no-download ~/envs/zoobot39_dev + source ~/envs/zoobot39_dev/bin/activate + + avail_wheels "torch*" + +Latest is currently 2.0.1 (no 2.1.0 yet) + + pip install --no-index torch==2.0.1 torchvision torchtext torchaudio + +Storage under /home/user is not ideal, 50gb space. Use /project/def-bovy/walml (1TB space). +Can transfer data via rsync login node. + +Move ssh key for easy login (run on LOCAL desktop) + + ssh-copy-id walml@narval.alliancecan.ca + +Make new pub id key for github (back on cluster) + + ssh-keygen -t rsa -b 4096 + cat ~/.ssh/id_rsa.pub +and add to [Github](https://github.com/settings/keys) as normal + +Set up repos + + + cd /project/def-bovy/walml + +(I made a .bashrc alias, export PROJECT=/project/def-bovy/walml) + +pip install --no-index -r zoobot/only_for_me/narval/requirements.txt + +and my own cloned repos +pip install --no-deps -e galaxy-datasets +pip install --no-deps -e zoobot diff --git a/only_for_me/narval/pytorch_test.py b/only_for_me/narval/pytorch_test.py new file mode 100644 index 00000000..4a476970 --- /dev/null +++ b/only_for_me/narval/pytorch_test.py @@ -0,0 +1,23 @@ +import torch +x = torch.Tensor(5, 3) +print(x) +y = torch.rand(5, 3) +print(y) +# let us run the following only if CUDA is available +if torch.cuda.is_available(): + x = x.cuda() + y = y.cuda() + print(x + y) +else: + raise AssertionError('CUDA not available') + +# TODO DemoRings from galaxy-datasets + +from galaxy_datasets import galaxy_mnist + +root = '/project/def-bovy/walml/data/roots/galaxy_mnist' + +df, label_cols = galaxy_mnist(root, train=True, download=False) + + +# TODO import zoobot and use something diff --git a/only_for_me/narval/requirements.txt b/only_for_me/narval/requirements.txt new file mode 100644 index 00000000..f2eb1e23 --- /dev/null +++ b/only_for_me/narval/requirements.txt @@ -0,0 +1,28 @@ +torch +torchvision +torchaudio +pytorch-lightning +albumentations +pyro-ppl +torchmetrics +timm +wandb +h5py +astropy +# below already required by packages above + +# tqdm +# pillow +# numpy +# pandas +# scipy +# scikit-image +# scikit-learn +# matplotlib + +# not prebuilt, build fails! 
+# pyarrow + +# possibly trouble ahead with these two +# +# From 9e3414c1a13883b019a780e7430cfc165ec402e6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 20:56:24 -0400 Subject: [PATCH 002/307] try punch it --- only_for_me/narval/finetune.py | 52 +++++++++++++++++++++++++++++ only_for_me/narval/narval.md | 1 + only_for_me/narval/pytorch_test.py | 2 +- only_for_me/narval/requirements.txt | 3 ++ 4 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 only_for_me/narval/finetune.py diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py new file mode 100644 index 00000000..8c9c81b0 --- /dev/null +++ b/only_for_me/narval/finetune.py @@ -0,0 +1,52 @@ +import logging +import os +import shutil + +from zoobot.pytorch.training import finetune +from galaxy_datasets import galaxy_mnist +from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule + + +if __name__ == '__main__': + + logging.basicConfig(level=logging.INFO) + + data_dir = '/tmp/walml/finetune' + os.mkdirs(data_dir) + + remote_image_dir = '/project/def-bovy/walml/repos/galaxy-imagesets/roots/galaxy_mnist' + image_dir = data_dir + '/images' + shutil.copytree(remote_image_dir, image_dir) + + remote_zoobot_dir = '/project/def-bovy/walml/repos/zoobot' + + batch_size = 32 + num_workers= 8 + n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. + max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. \ + + train_catalog, _ = galaxy_mnist(root=data_dir, download=False, train=True) + test_catalog, _ = galaxy_mnist(root=data_dir, download=False, train=False) + + label_cols = ['label'] + num_classes = 4 + + # load a pretrained checkpoint saved here + # rsync -avz --no-g --no-p /home/walml/repos/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt walml@narval.alliancecan.ca:/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch + checkpoint_loc = os.path.join(remote_zoobot_dir, 'data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt') + + datamodule = GalaxyDataModule( + label_cols=label_cols, + catalog=train_catalog, # very small, as a demo + batch_size=batch_size, # increase for faster training, decrease to avoid out-of-memory errors + num_workers=num_workers # TODO set to a little less than num. 
CPUs + ) + datamodule.setup() + model = finetune.FinetuneableZoobotClassifier( + checkpoint_loc=checkpoint_loc, + num_classes=num_classes, + n_blocks=n_blocks + ) + trainer = finetune.get_trainer(data_dir, accelerator='auto', max_epochs=max_epochs) + trainer.fit(model, datamodule) + trainer.test(model, datamodule) diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md index b7b0f729..4758cf13 100644 --- a/only_for_me/narval/narval.md +++ b/only_for_me/narval/narval.md @@ -5,6 +5,7 @@ https://docs.alliancecan.ca/wiki/Narval/en https://docs.alliancecan.ca/wiki/AI_and_Machine_Learning https://docs.alliancecan.ca/wiki/PyTorch https://prashp.gitlab.io/post/compute-canada-tut/ +https://docs.alliancecan.ca/wiki/Python ssh walml@narval.alliancecan.ca diff --git a/only_for_me/narval/pytorch_test.py b/only_for_me/narval/pytorch_test.py index 4a476970..9f156b8d 100644 --- a/only_for_me/narval/pytorch_test.py +++ b/only_for_me/narval/pytorch_test.py @@ -17,7 +17,7 @@ root = '/project/def-bovy/walml/data/roots/galaxy_mnist' -df, label_cols = galaxy_mnist(root, train=True, download=False) +df, label_cols = galaxy_mnist(root, train=True, download=False) # must be already downloaded, no internet # TODO import zoobot and use something diff --git a/only_for_me/narval/requirements.txt b/only_for_me/narval/requirements.txt index f2eb1e23..9f22e9df 100644 --- a/only_for_me/narval/requirements.txt +++ b/only_for_me/narval/requirements.txt @@ -9,6 +9,9 @@ timm wandb h5py astropy +pandas +matplotlib +fastparquet # replacing pyarrow below # below already required by packages above # tqdm From 2f53b7eba13655eedf03472ca3a6a70124a13fbf Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 20:57:34 -0400 Subject: [PATCH 003/307] add .sh --- only_for_me/narval/finetune.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 only_for_me/narval/finetune.sh diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh new file mode 100644 index 00000000..406ccad8 --- /dev/null +++ b/only_for_me/narval/finetune.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --mem=32G +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=0:1:0 + +#### SBATCH --gres=gpu:a100:1 +#### SBATCH --mail-user= +#### SBATCH --mail-type=ALL + +PYTHON=/home/envs/zoobot39_dev/bin/python + +$PYTHON /project/def-bovy/walml/repos/zoobot/only_for_me/narval/finetune.py From cf73f5755364f69b7d49e682f33e8ee32d0ba7e6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:01:14 -0400 Subject: [PATCH 004/307] typo --- only_for_me/narval/finetune.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 406ccad8..2fd2c958 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -1,13 +1,13 @@ #!/bin/bash -#SBATCH --mem=32G +#SBATCH --mem=8G #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=8 +#SBATCH --ntasks-per-node=4 #SBATCH --time=0:1:0 #### SBATCH --gres=gpu:a100:1 #### SBATCH --mail-user= #### SBATCH --mail-type=ALL -PYTHON=/home/envs/zoobot39_dev/bin/python +PYTHON=/home/walml/envs/zoobot39_dev/bin/python $PYTHON /project/def-bovy/walml/repos/zoobot/only_for_me/narval/finetune.py From 4f96471b95a9862c10843868360c9c8becca1c55 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:04:46 -0400 Subject: [PATCH 005/307] typo --- only_for_me/narval/finetune.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/finetune.sh 
b/only_for_me/narval/finetune.sh index 2fd2c958..6413f312 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -10,4 +10,4 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python -$PYTHON /project/def-bovy/walml/repos/zoobot/only_for_me/narval/finetune.py +$PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py From 2330475c1318757f378909a8d3b18ba966d2b5ff Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:10:27 -0400 Subject: [PATCH 006/307] run for real on cpu --- only_for_me/narval/finetune.py | 3 +++ only_for_me/narval/finetune.sh | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index 8c9c81b0..e93a1757 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -10,6 +10,7 @@ if __name__ == '__main__': logging.basicConfig(level=logging.INFO) + logging.info('Begin') data_dir = '/tmp/walml/finetune' os.mkdirs(data_dir) @@ -17,6 +18,7 @@ remote_image_dir = '/project/def-bovy/walml/repos/galaxy-imagesets/roots/galaxy_mnist' image_dir = data_dir + '/images' shutil.copytree(remote_image_dir, image_dir) + logging.info('Copied') remote_zoobot_dir = '/project/def-bovy/walml/repos/zoobot' @@ -27,6 +29,7 @@ train_catalog, _ = galaxy_mnist(root=data_dir, download=False, train=True) test_catalog, _ = galaxy_mnist(root=data_dir, download=False, train=False) + logging.info('Data ready') label_cols = ['label'] num_classes = 4 diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 6413f312..8ca8d103 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -1,8 +1,8 @@ #!/bin/bash -#SBATCH --mem=8G +#SBATCH --mem=16G #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 -#SBATCH --time=0:1:0 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=0:15:0 #### SBATCH --gres=gpu:a100:1 #### SBATCH --mail-user= From 2b71e574ee5f22a3cfa97b4a98babe943c52bc42 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:13:58 -0400 Subject: [PATCH 007/307] typo --- only_for_me/narval/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index e93a1757..f268d733 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -13,7 +13,7 @@ logging.info('Begin') data_dir = '/tmp/walml/finetune' - os.mkdirs(data_dir) + os.makedirs(data_dir) remote_image_dir = '/project/def-bovy/walml/repos/galaxy-imagesets/roots/galaxy_mnist' image_dir = data_dir + '/images' From a1a200e99c12ec982a4b5941fa3e4bf62366765f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:18:49 -0400 Subject: [PATCH 008/307] typo --- only_for_me/narval/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index f268d733..f75169ed 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -15,7 +15,7 @@ data_dir = '/tmp/walml/finetune' os.makedirs(data_dir) - remote_image_dir = '/project/def-bovy/walml/repos/galaxy-imagesets/roots/galaxy_mnist' + remote_image_dir = '/project/def-bovy/walml/repos/galaxy-datasets/roots/galaxy_mnist' image_dir = data_dir + '/images' shutil.copytree(remote_image_dir, image_dir) logging.info('Copied') From 5143194624032db6dfb1c3c0be4b46f2bd92d2d6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:20:59 -0400 Subject: [PATCH 009/307] typo --- 
only_for_me/narval/finetune.py | 4 ++-- only_for_me/narval/narval.md | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index f75169ed..b9bdf388 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -15,12 +15,12 @@ data_dir = '/tmp/walml/finetune' os.makedirs(data_dir) - remote_image_dir = '/project/def-bovy/walml/repos/galaxy-datasets/roots/galaxy_mnist' + remote_image_dir = '/project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist' image_dir = data_dir + '/images' shutil.copytree(remote_image_dir, image_dir) logging.info('Copied') - remote_zoobot_dir = '/project/def-bovy/walml/repos/zoobot' + remote_zoobot_dir = '/project/def-bovy/walml/zoobot' batch_size = 32 num_workers= 8 diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md index 4758cf13..d6700aa7 100644 --- a/only_for_me/narval/narval.md +++ b/only_for_me/narval/narval.md @@ -50,3 +50,14 @@ pip install --no-index -r zoobot/only_for_me/narval/requirements.txt and my own cloned repos pip install --no-deps -e galaxy-datasets pip install --no-deps -e zoobot + + +Multi-node notes + +https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_2.html# +https://pytorch.org/docs/stable/elastic/run.html#environment-variables +https://github.com/webdataset/webdataset/issues/250 +https://github.com/webdataset/webdataset-lightning/blob/main/train.py +https://lightning.ai/forums/t/multi-gpu-multi-node-training-with-webdataset/2300 +https://webdataset.github.io/webdataset/multinode/ +https://webdataset.github.io/webdataset/creating/ \ No newline at end of file From 77ff2dadc0e877110961e525cd10f3625511d81e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:42:11 -0400 Subject: [PATCH 010/307] try in bash --- only_for_me/narval/finetune.py | 18 ++++-------------- only_for_me/narval/finetune.sh | 10 ++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index b9bdf388..63ddf957 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -12,23 +12,13 @@ logging.basicConfig(level=logging.INFO) logging.info('Begin') - data_dir = '/tmp/walml/finetune' - os.makedirs(data_dir) - - remote_image_dir = '/project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist' - image_dir = data_dir + '/images' - shutil.copytree(remote_image_dir, image_dir) - logging.info('Copied') - - remote_zoobot_dir = '/project/def-bovy/walml/zoobot' - batch_size = 32 num_workers= 8 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. 
\ - train_catalog, _ = galaxy_mnist(root=data_dir, download=False, train=True) - test_catalog, _ = galaxy_mnist(root=data_dir, download=False, train=False) + train_catalog, _ = galaxy_mnist(root='/tmp/walml/finetune/data/galaxy_mnist', download=False, train=True) + test_catalog, _ = galaxy_mnist(root='/tmp/walml/finetune/data/galaxy_mnist', download=False, train=False) logging.info('Data ready') label_cols = ['label'] @@ -36,7 +26,7 @@ # load a pretrained checkpoint saved here # rsync -avz --no-g --no-p /home/walml/repos/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt walml@narval.alliancecan.ca:/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch - checkpoint_loc = os.path.join(remote_zoobot_dir, 'data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt') + checkpoint_loc = '/project/bovy-dev/walml/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt' datamodule = GalaxyDataModule( label_cols=label_cols, @@ -50,6 +40,6 @@ num_classes=num_classes, n_blocks=n_blocks ) - trainer = finetune.get_trainer(data_dir, accelerator='auto', max_epochs=max_epochs) + trainer = finetune.get_trainer('/tmp/walml/finetune/checkpoints', accelerator='auto', max_epochs=max_epochs) trainer.fit(model, datamodule) trainer.test(model, datamodule) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 8ca8d103..4bdf074d 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -10,4 +10,14 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python +mkdir /tmp/walml/finetune +mkdir /tmp/walml/finetune/data +mkdir /tmp/walml/finetune/checkpoints + +cp /project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist /tmp/walml/finetune/data/ + +ls /tmp/walml/finetune/data/galaxy_mnist + $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py + +ls /tmp/walml/finetune/checkpoints From fa666b0409354d3b1f5e922148b36a788f9905f5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:46:04 -0400 Subject: [PATCH 011/307] tmpdir --- only_for_me/narval/finetune.py | 8 +++++--- only_for_me/narval/finetune.sh | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index 63ddf957..5e0fc65c 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -12,13 +12,15 @@ logging.basicConfig(level=logging.INFO) logging.info('Begin') + logging.info(os.environ['SLURM_TMPDIR']) + batch_size = 32 num_workers= 8 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. 
\ - train_catalog, _ = galaxy_mnist(root='/tmp/walml/finetune/data/galaxy_mnist', download=False, train=True) - test_catalog, _ = galaxy_mnist(root='/tmp/walml/finetune/data/galaxy_mnist', download=False, train=False) + train_catalog, _ = galaxy_mnist(root=os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'), download=False, train=True) + test_catalog, _ = galaxy_mnist(root=os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'), download=False, train=False) logging.info('Data ready') label_cols = ['label'] @@ -40,6 +42,6 @@ num_classes=num_classes, n_blocks=n_blocks ) - trainer = finetune.get_trainer('/tmp/walml/finetune/checkpoints', accelerator='auto', max_epochs=max_epochs) + trainer = finetune.get_trainer(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), accelerator='auto', max_epochs=max_epochs) trainer.fit(model, datamodule) trainer.test(model, datamodule) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 4bdf074d..e8d644ca 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -10,14 +10,14 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python -mkdir /tmp/walml/finetune -mkdir /tmp/walml/finetune/data -mkdir /tmp/walml/finetune/checkpoints +mkdir $SLURM_TMPDIR/walml/finetune +mkdir $SLURM_TMPDIR/walml/finetune/data +mkdir $SLURM_TMPDIR/walml/finetune/checkpoints -cp /project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist /tmp/walml/finetune/data/ +cp /project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ -ls /tmp/walml/finetune/data/galaxy_mnist +ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py -ls /tmp/walml/finetune/checkpoints +ls $SLURM_TMPDIR/walml/finetune/checkpoints From abe7462ec2274a264fe2aa5c8bf84b98cc1ed1bb Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:46:41 -0400 Subject: [PATCH 012/307] typo --- only_for_me/narval/finetune.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index e8d644ca..a3d93b20 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -10,6 +10,7 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python +mkdir $SLURM_TMPDIR/walml mkdir $SLURM_TMPDIR/walml/finetune mkdir $SLURM_TMPDIR/walml/finetune/data mkdir $SLURM_TMPDIR/walml/finetune/checkpoints From 70bfb4099b686e6d6f82bf9768cb308f83579c92 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:46:54 -0400 Subject: [PATCH 013/307] -r --- only_for_me/narval/finetune.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index a3d93b20..db082ae4 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -15,7 +15,7 @@ mkdir $SLURM_TMPDIR/walml/finetune mkdir $SLURM_TMPDIR/walml/finetune/data mkdir $SLURM_TMPDIR/walml/finetune/checkpoints -cp /project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ +cp -r /project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist From 94ed1cf81e36a5819a4dee0c47c8aa9952788002 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 21:53:57 -0400 Subject: [PATCH 014/307] typo --- only_for_me/narval/finetune.py | 4 ++++ only_for_me/narval/finetune.sh | 2 +- 2 files changed, 5 
insertions(+), 1 deletion(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index 5e0fc65c..b59f240c 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -14,6 +14,10 @@ logging.info(os.environ['SLURM_TMPDIR']) + import glob + logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data'))) + logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) + batch_size = 32 num_workers= 8 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index db082ae4..69051d1a 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -15,7 +15,7 @@ mkdir $SLURM_TMPDIR/walml/finetune mkdir $SLURM_TMPDIR/walml/finetune/data mkdir $SLURM_TMPDIR/walml/finetune/checkpoints -cp -r /project/def-bovy/walml/galaxy-datasets/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ +cp -r /project/def-bovy/walml/data/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist From 2a7ffa5729a3167c45ec7109f8a79caae5c4fef0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 22:01:16 -0400 Subject: [PATCH 015/307] typo --- only_for_me/narval/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index b59f240c..18499e84 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -32,7 +32,7 @@ # load a pretrained checkpoint saved here # rsync -avz --no-g --no-p /home/walml/repos/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt walml@narval.alliancecan.ca:/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch - checkpoint_loc = '/project/bovy-dev/walml/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt' + checkpoint_loc = '/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt' datamodule = GalaxyDataModule( label_cols=label_cols, From e5c24f38b5f50c25fdf574e8e74fbf033ad1c662 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 22:14:43 -0400 Subject: [PATCH 016/307] map cpu if needed --- zoobot/pytorch/training/finetune.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 31f33b82..bd9c8dee 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -439,9 +439,13 @@ def load_pretrained_encoder(checkpoint_loc: str) -> torch.nn.Sequential: Returns: torch.nn.Sequential: pretrained PyTorch encoder within that LightningModule. 
""" - return define_model.ZoobotTree.load_from_checkpoint( - checkpoint_loc).encoder - + if torch.cuda.is_available(): + map_location = None + else: + # necessary to load gpu-trained model on cpu + map_location = torch.device('cpu') + return define_model.ZoobotTree.load_from_checkpoint(checkpoint_loc, map_location=map_location).encoder + def get_trainer( save_dir: str, From f9afc3ff1d0ae59a24347388112c60391a41dc3a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 22:38:17 -0400 Subject: [PATCH 017/307] A100 time --- only_for_me/narval/finetune.py | 11 +++++++---- only_for_me/narval/finetune.sh | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index 18499e84..f56db02f 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -14,11 +14,14 @@ logging.info(os.environ['SLURM_TMPDIR']) - import glob - logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data'))) - logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) + # import glob + # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data'))) + # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) - batch_size = 32 + import torch + assert torch.cuda.is_available() + + batch_size = 128 num_workers= 8 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. \ diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 69051d1a..52a40df8 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -3,8 +3,8 @@ #SBATCH --nodes=1 #SBATCH --ntasks-per-node=8 #SBATCH --time=0:15:0 +# SBATCH --gres=gpu:a100:1 -#### SBATCH --gres=gpu:a100:1 #### SBATCH --mail-user= #### SBATCH --mail-type=ALL From 68c4dec1818815db308398e2517439365e5b2f77 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 22:40:56 -0400 Subject: [PATCH 018/307] cuda --- only_for_me/narval/finetune.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 52a40df8..f47b57f2 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -8,6 +8,8 @@ #### SBATCH --mail-user= #### SBATCH --mail-type=ALL +module load StdEnv/2020 # CUDA etc + PYTHON=/home/walml/envs/zoobot39_dev/bin/python mkdir $SLURM_TMPDIR/walml From da92e4198e6c769d1946d7d98b4e7870e2bcd384 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 17 Oct 2023 22:41:32 -0400 Subject: [PATCH 019/307] typo --- only_for_me/narval/finetune.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index f47b57f2..e2cc3278 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -3,12 +3,13 @@ #SBATCH --nodes=1 #SBATCH --ntasks-per-node=8 #SBATCH --time=0:15:0 -# SBATCH --gres=gpu:a100:1 +#SBATCH --gres=gpu:a100:1 #### SBATCH --mail-user= #### SBATCH --mail-type=ALL module load StdEnv/2020 # CUDA etc +nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python From b063b49e8bc4436121845d2931cd4516fbdab7e7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 12:34:34 -0400 Subject: 
[PATCH 020/307] try wandb --- only_for_me/narval/finetune.py | 19 ++++++++++++++++--- only_for_me/narval/finetune.sh | 18 ++++++++++++++---- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index f56db02f..e4bf8580 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -2,6 +2,8 @@ import os import shutil +from pytorch_lightning.loggers import WandbLogger + from zoobot.pytorch.training import finetune from galaxy_datasets import galaxy_mnist from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule @@ -19,9 +21,10 @@ # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) import torch + torch.set_float32_matmul_precision('medium') assert torch.cuda.is_available() - batch_size = 128 + batch_size = 256 num_workers= 8 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. \ @@ -36,6 +39,8 @@ # load a pretrained checkpoint saved here # rsync -avz --no-g --no-p /home/walml/repos/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt walml@narval.alliancecan.ca:/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch checkpoint_loc = '/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt' + + logger = WandbLogger(name='debug', save_dir='/project/def-bovy/walml/wandb/debug', project='narval', log_model=False, offline=True) datamodule = GalaxyDataModule( label_cols=label_cols, @@ -49,6 +54,14 @@ num_classes=num_classes, n_blocks=n_blocks ) - trainer = finetune.get_trainer(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), accelerator='auto', max_epochs=max_epochs) + trainer = finetune.get_trainer( + os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), + accelerator='gpu', + devices=2, + strategy='ddp', + precision='16-mixed', + max_epochs=max_epochs, + logger=logger + ) trainer.fit(model, datamodule) - trainer.test(model, datamodule) + # trainer.test(model, datamodule) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index e2cc3278..8f076053 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -1,9 +1,15 @@ #!/bin/bash -#SBATCH --mem=16G +#SBATCH --mem=32G #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=8 -#SBATCH --time=0:15:0 -#SBATCH --gres=gpu:a100:1 +#SBATCH --time=0:10:0 +#SBATCH --ntasks-per-node=16 +#SBATCH --gres=gpu:a100:2 + +#### SBATCH --mem=16G +#### SBATCH --nodes=1 +#### SBATCH --time=0:10:0 +#### SBATCH --ntasks-per-node=8 +#### SBATCH --gres=gpu:a100:1 #### SBATCH --mail-user= #### SBATCH --mail-type=ALL @@ -22,6 +28,10 @@ cp -r /project/def-bovy/walml/data/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetu ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist +pip install --no-index wandb + +wandb offline # only write metadata locally + $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py ls $SLURM_TMPDIR/walml/finetune/checkpoints From 5f610592053f7ce5a363d81cebebbef1555c7cb0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 12:47:38 -0400 Subject: [PATCH 021/307] try local env --- only_for_me/narval/finetune.py | 4 ++-- only_for_me/narval/finetune.sh | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff 
--git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index e4bf8580..f6a80831 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -11,7 +11,7 @@ if __name__ == '__main__': - logging.basicConfig(level=logging.INFO) + logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO) logging.info('Begin') logging.info(os.environ['SLURM_TMPDIR']) @@ -25,7 +25,7 @@ assert torch.cuda.is_available() batch_size = 256 - num_workers= 8 + num_workers= 4 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. \ diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 8f076053..2cea3343 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --mem=32G #SBATCH --nodes=1 -#SBATCH --time=0:10:0 -#SBATCH --ntasks-per-node=16 +#SBATCH --time=0:20:0 +#SBATCH --ntasks-per-node=8 #SBATCH --gres=gpu:a100:2 #### SBATCH --mem=16G @@ -17,7 +17,16 @@ module load StdEnv/2020 # CUDA etc nvidia-smi -PYTHON=/home/walml/envs/zoobot39_dev/bin/python +# PYTHON=/home/walml/envs/zoobot39_dev/bin/python + +module load python/3.9.6 +virtualenv --no-download $SLURM_TMPDIR/env +source $SLURM_TMPDIR/env/bin/activate +pip install --no-index -r zoobot/only_for_me/narval/requirements.txt +cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ +cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ +pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets +pip install --no-deps -e $SLURM_TMPDIR/zoobot mkdir $SLURM_TMPDIR/walml mkdir $SLURM_TMPDIR/walml/finetune @@ -32,6 +41,7 @@ pip install --no-index wandb wandb offline # only write metadata locally -$PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py +# $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py +python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py ls $SLURM_TMPDIR/walml/finetune/checkpoints From ace6dff5e9fa5f501b802689ce91898cc353b4e8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 12:53:55 -0400 Subject: [PATCH 022/307] add env test --- only_for_me/narval/env_test.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 only_for_me/narval/env_test.sh diff --git a/only_for_me/narval/env_test.sh b/only_for_me/narval/env_test.sh new file mode 100644 index 00000000..036844a7 --- /dev/null +++ b/only_for_me/narval/env_test.sh @@ -0,0 +1,18 @@ +# SBATCH --mem=16G +# SBATCH --nodes=1 +# SBATCH --time=0:5:0 +# SBATCH --ntasks-per-node=4 + +echo "$now" + +module load StdEnv/2020 +module load python/3.9.6 +virtualenv --no-download $SLURM_TMPDIR/env +source $SLURM_TMPDIR/env/bin/activate +pip install --no-index -r /project/def-bovy/zoobot/only_for_me/narval/requirements.txt +cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ +cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ +pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets +pip install --no-deps -e $SLURM_TMPDIR/zoobot + +echo "$now" \ No newline at end of file From 8dc7fc866b49d09190100b7361eff75d83b454a7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 12:54:23 -0400 Subject: [PATCH 023/307] typo --- only_for_me/narval/env_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/only_for_me/narval/env_test.sh 
b/only_for_me/narval/env_test.sh index 036844a7..948c783b 100644 --- a/only_for_me/narval/env_test.sh +++ b/only_for_me/narval/env_test.sh @@ -1,3 +1,4 @@ +#!/bin/bash # SBATCH --mem=16G # SBATCH --nodes=1 # SBATCH --time=0:5:0 From cd17b92c73ea29aa6187bb61bebdaa227c18c8cf Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 13:16:54 -0400 Subject: [PATCH 024/307] retry env --- only_for_me/narval/env_test.sh | 12 ++++++------ only_for_me/narval/finetune.sh | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/only_for_me/narval/env_test.sh b/only_for_me/narval/env_test.sh index 948c783b..b472fae8 100644 --- a/only_for_me/narval/env_test.sh +++ b/only_for_me/narval/env_test.sh @@ -1,8 +1,8 @@ #!/bin/bash -# SBATCH --mem=16G -# SBATCH --nodes=1 -# SBATCH --time=0:5:0 -# SBATCH --ntasks-per-node=4 +#SBATCH --mem=16G +#SBATCH --nodes=1 +#SBATCH --time=0:5:0 +#SBATCH --ntasks-per-node=4 echo "$now" @@ -10,10 +10,10 @@ module load StdEnv/2020 module load python/3.9.6 virtualenv --no-download $SLURM_TMPDIR/env source $SLURM_TMPDIR/env/bin/activate -pip install --no-index -r /project/def-bovy/zoobot/only_for_me/narval/requirements.txt +pip install --no-index -r /project/def-bovy/walml/zoobot/only_for_me/narval/requirements.txt cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets pip install --no-deps -e $SLURM_TMPDIR/zoobot -echo "$now" \ No newline at end of file +echo "$now" diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 2cea3343..c4c843e0 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -22,7 +22,7 @@ nvidia-smi module load python/3.9.6 virtualenv --no-download $SLURM_TMPDIR/env source $SLURM_TMPDIR/env/bin/activate -pip install --no-index -r zoobot/only_for_me/narval/requirements.txt +pip install --no-index -r /project/def-bovy/walml/zoobot/only_for_me/narval/requirements.txt cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets From 737fb199664d9e53c8b532b04363e1f730b9d928 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 14:02:20 -0400 Subject: [PATCH 025/307] ddp hangs, add srun and NCCL_BLOCKING_WAIT --- only_for_me/narval/finetune.py | 2 ++ only_for_me/narval/finetune.sh | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index f6a80831..b7ac7359 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -58,9 +58,11 @@ os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), accelerator='gpu', devices=2, + nodes=1, strategy='ddp', precision='16-mixed', max_epochs=max_epochs, + enable_progress_bar=False, logger=logger ) trainer.fit(model, datamodule) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index c4c843e0..ecd6156a 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -2,7 +2,8 @@ #SBATCH --mem=32G #SBATCH --nodes=1 #SBATCH --time=0:20:0 -#SBATCH --ntasks-per-node=8 +#SBATCH --tasks-per-node=2 +#SBATCH --cpus-per-task=4 #SBATCH --gres=gpu:a100:2 #### SBATCH --mem=16G @@ -41,7 +42,12 @@ pip install --no-index wandb wandb offline # only write metadata locally +export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend 
for inter-GPU communication. +# export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. +# echo "r$SLURM_NODEID master: $MASTER_ADDR" +# echo "r$SLURM_NODEID Launching python script" + # $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py -python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py +srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py ls $SLURM_TMPDIR/walml/finetune/checkpoints From db18202e2231b01ad97f8348d57d3350c5964f2e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 14:19:01 -0400 Subject: [PATCH 026/307] use normal env for speed --- only_for_me/narval/finetune.py | 2 +- only_for_me/narval/finetune.sh | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index b7ac7359..a273618c 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -58,7 +58,7 @@ os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), accelerator='gpu', devices=2, - nodes=1, + num_nodes=1, strategy='ddp', precision='16-mixed', max_epochs=max_epochs, diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index ecd6156a..11eebc0b 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -18,16 +18,16 @@ module load StdEnv/2020 # CUDA etc nvidia-smi -# PYTHON=/home/walml/envs/zoobot39_dev/bin/python +PYTHON=/home/walml/envs/zoobot39_dev/bin/python -module load python/3.9.6 -virtualenv --no-download $SLURM_TMPDIR/env -source $SLURM_TMPDIR/env/bin/activate -pip install --no-index -r /project/def-bovy/walml/zoobot/only_for_me/narval/requirements.txt -cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ -cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ -pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets -pip install --no-deps -e $SLURM_TMPDIR/zoobot +# module load python/3.9.6 +# virtualenv --no-download $SLURM_TMPDIR/env +# source $SLURM_TMPDIR/env/bin/activate +# pip install --no-index -r /project/def-bovy/walml/zoobot/only_for_me/narval/requirements.txt +# cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ +# cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ +# pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets +# pip install --no-deps -e $SLURM_TMPDIR/zoobot mkdir $SLURM_TMPDIR/walml mkdir $SLURM_TMPDIR/walml/finetune @@ -38,8 +38,6 @@ cp -r /project/def-bovy/walml/data/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetu ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist -pip install --no-index wandb - wandb offline # only write metadata locally export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. 
@@ -47,7 +45,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID master: $MASTER_ADDR" # echo "r$SLURM_NODEID Launching python script" -# $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py -srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py +$PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py +# srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py ls $SLURM_TMPDIR/walml/finetune/checkpoints From bc6cc6ec84e661364ce9a950c5722a3582ccd708 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 14:32:00 -0400 Subject: [PATCH 027/307] exact example --- only_for_me/narval/finetune.py | 2 + only_for_me/narval/pytorch-ddp-test-pl.py | 78 +++++++++++++++++++++++ only_for_me/narval/pytorch-ddp-test-pl.sh | 20 ++++++ 3 files changed, 100 insertions(+) create mode 100644 only_for_me/narval/pytorch-ddp-test-pl.py create mode 100644 only_for_me/narval/pytorch-ddp-test-pl.sh diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index a273618c..cadce1a3 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -16,6 +16,8 @@ logging.info(os.environ['SLURM_TMPDIR']) + os.environ['NCCL_BLOCKING_WAIT'] = 1 + # import glob # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data'))) # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) diff --git a/only_for_me/narval/pytorch-ddp-test-pl.py b/only_for_me/narval/pytorch-ddp-test-pl.py new file mode 100644 index 00000000..0c89cd3f --- /dev/null +++ b/only_for_me/narval/pytorch-ddp-test-pl.py @@ -0,0 +1,78 @@ +# import datetime + +import torch +from torch import nn +import torch.nn.functional as F + +import pytorch_lightning as pl + +# import torchvision +import torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 +from torch.utils.data import DataLoader + +import argparse + +parser = argparse.ArgumentParser(description='cifar10 classification models, pytorch-lightning parallel test') +parser.add_argument('--lr', default=0.1, help='') +parser.add_argument('--max_epochs', type=int, default=4, help='') +parser.add_argument('--batch_size', type=int, default=768, help='') +parser.add_argument('--num_workers', type=int, default=0, help='') + + +def main(): + print("Starting...") + + args = parser.parse_args() + + class Net(pl.LightningModule): + + def __init__(self): + super(Net, self).__init__() + + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 5 * 5) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=args.lr) + + net = Net() + + """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs per node. + To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs + and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes. 
+ We also set progress_bar_refresh_rate=0 to avoid writing a progress bar to the logs, + which can cause issues due to updating logs too frequently.""" + + trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy='ddp', max_epochs = args.max_epochs, enable_progress_bar=False) + + transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + + dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) + + train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) + + trainer.fit(net,train_loader) + + +if __name__=='__main__': + main() \ No newline at end of file diff --git a/only_for_me/narval/pytorch-ddp-test-pl.sh b/only_for_me/narval/pytorch-ddp-test-pl.sh new file mode 100644 index 00000000..c37074d8 --- /dev/null +++ b/only_for_me/narval/pytorch-ddp-test-pl.sh @@ -0,0 +1,20 @@ +#!/bin/bash +#SBATCH --nodes 1 +#SBATCH --gres=gpu:2 # Request 2 GPU "generic resources”. +#SBATCH --tasks-per-node=2 # Request 1 process per GPU. You will get 1 CPU per process by default. Request more CPUs with the "cpus-per-task" parameter to enable multiple data-loader workers to load data in parallel. +#SBATCH --mem=8G +#SBATCH --time=0-03:00 +#SBATCH --output=%N-%j.out + +module load python # Using Default Python version - Make sure to choose a version that suits your application +virtualenv --no-download $SLURM_TMPDIR/env +source $SLURM_TMPDIR/env/bin/activate +pip install torchvision pytorch-lightning --no-index + +export NCCL_BLOCKING_WAIT=1 #Pytorch Lightning uses the NCCL backend for inter-GPU communication by default. Set this variable to avoid timeout errors. + +# PyTorch Lightning will query the environment to figure out if it is running inside a SLURM batch job +# If it is, it expects the user to have requested one task per GPU. +# If you do not ask for 1 task per GPU, and you do not run your script with "srun", your job will fail! + +srun python pytorch-ddp-test-pl.py --batch_size 256 From a15ce4746396cb1cfc7d731a12254abb09865f53 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 14:33:06 -0400 Subject: [PATCH 028/307] reduce time needed --- only_for_me/narval/pytorch-ddp-test-pl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/pytorch-ddp-test-pl.sh b/only_for_me/narval/pytorch-ddp-test-pl.sh index c37074d8..0ce7f573 100644 --- a/only_for_me/narval/pytorch-ddp-test-pl.sh +++ b/only_for_me/narval/pytorch-ddp-test-pl.sh @@ -3,7 +3,7 @@ #SBATCH --gres=gpu:2 # Request 2 GPU "generic resources”. #SBATCH --tasks-per-node=2 # Request 1 process per GPU. You will get 1 CPU per process by default. Request more CPUs with the "cpus-per-task" parameter to enable multiple data-loader workers to load data in parallel. 
#SBATCH --mem=8G -#SBATCH --time=0-03:00 +#SBATCH --time=0-00:15 #SBATCH --output=%N-%j.out module load python # Using Default Python version - Make sure to choose a version that suits your application From 516a569b309787c1daf3f7aab4bceafd12ba5be4 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 14:36:38 -0400 Subject: [PATCH 029/307] path typo --- only_for_me/narval/pytorch-ddp-test-pl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/pytorch-ddp-test-pl.sh b/only_for_me/narval/pytorch-ddp-test-pl.sh index 0ce7f573..032731cb 100644 --- a/only_for_me/narval/pytorch-ddp-test-pl.sh +++ b/only_for_me/narval/pytorch-ddp-test-pl.sh @@ -17,4 +17,4 @@ export NCCL_BLOCKING_WAIT=1 #Pytorch Lightning uses the NCCL backend for inter-G # If it is, it expects the user to have requested one task per GPU. # If you do not ask for 1 task per GPU, and you do not run your script with "srun", your job will fail! -srun python pytorch-ddp-test-pl.py --batch_size 256 +srun python /project/def-bovy/walml/zoobot/only_for_me/narval/pytorch-ddp-test-pl.py --batch_size 256 From d1b8632766887dec276f18e4672ea160ac752d6d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 14:41:31 -0400 Subject: [PATCH 030/307] download cifar10 --- only_for_me/narval/pytorch-ddp-test-pl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/pytorch-ddp-test-pl.py b/only_for_me/narval/pytorch-ddp-test-pl.py index 0c89cd3f..eb35ed85 100644 --- a/only_for_me/narval/pytorch-ddp-test-pl.py +++ b/only_for_me/narval/pytorch-ddp-test-pl.py @@ -67,7 +67,7 @@ def configure_optimizers(self): transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) + dataset_train = CIFAR10(root='/project/def-bovy/walml/data/roots/cifar10', train=True, download=False, transform=transform_train) train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) From f4839e09287ba7d57fcd7f2c60d7494268020332 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 15:07:23 -0400 Subject: [PATCH 031/307] example works. Try simplify mine - no wandb - add srun - no StdEnv --- only_for_me/narval/finetune.py | 17 +++++++++-------- only_for_me/narval/finetune.sh | 8 ++++---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index cadce1a3..beb961b9 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -16,18 +16,18 @@ logging.info(os.environ['SLURM_TMPDIR']) - os.environ['NCCL_BLOCKING_WAIT'] = 1 + # os.environ['NCCL_BLOCKING_WAIT'] = 1 # import glob # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data'))) # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) - import torch - torch.set_float32_matmul_precision('medium') - assert torch.cuda.is_available() + # import torch + # torch.set_float32_matmul_precision('medium') + # assert torch.cuda.is_available() - batch_size = 256 - num_workers= 4 + batch_size = 128 + num_workers= 1 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. 
\ @@ -42,7 +42,8 @@ # rsync -avz --no-g --no-p /home/walml/repos/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt walml@narval.alliancecan.ca:/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch checkpoint_loc = '/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt' - logger = WandbLogger(name='debug', save_dir='/project/def-bovy/walml/wandb/debug', project='narval', log_model=False, offline=True) + # logger = WandbLogger(name='debug', save_dir='/project/def-bovy/walml/wandb/debug', project='narval', log_model=False, offline=True) + logger = None datamodule = GalaxyDataModule( label_cols=label_cols, @@ -62,7 +63,7 @@ devices=2, num_nodes=1, strategy='ddp', - precision='16-mixed', + # precision='16-mixed', max_epochs=max_epochs, enable_progress_bar=False, logger=logger diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 11eebc0b..3556f1af 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -3,9 +3,9 @@ #SBATCH --nodes=1 #SBATCH --time=0:20:0 #SBATCH --tasks-per-node=2 -#SBATCH --cpus-per-task=4 #SBATCH --gres=gpu:a100:2 +#### SBATCH --cpus-per-task=4 #### SBATCH --mem=16G #### SBATCH --nodes=1 #### SBATCH --time=0:10:0 @@ -15,7 +15,7 @@ #### SBATCH --mail-user= #### SBATCH --mail-type=ALL -module load StdEnv/2020 # CUDA etc +# module load StdEnv/2020 # CUDA etc nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python @@ -38,14 +38,14 @@ cp -r /project/def-bovy/walml/data/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetu ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist -wandb offline # only write metadata locally +# wandb offline # only write metadata locally export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. # echo "r$SLURM_NODEID master: $MASTER_ADDR" # echo "r$SLURM_NODEID Launching python script" -$PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py +srun $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py ls $SLURM_TMPDIR/walml/finetune/checkpoints From 052b0d52bb3502528eda061eaf2c2d64667f792e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 15:33:24 -0400 Subject: [PATCH 032/307] worked :) reactivate except for leaving srun on --- only_for_me/narval/finetune.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index beb961b9..301ba507 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -22,12 +22,12 @@ # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data'))) # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) - # import torch - # torch.set_float32_matmul_precision('medium') - # assert torch.cuda.is_available() + import torch + torch.set_float32_matmul_precision('medium') + assert torch.cuda.is_available() batch_size = 128 - num_workers= 1 + num_workers = 12 n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. 
\ @@ -42,8 +42,8 @@ # rsync -avz --no-g --no-p /home/walml/repos/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt walml@narval.alliancecan.ca:/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch checkpoint_loc = '/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt' - # logger = WandbLogger(name='debug', save_dir='/project/def-bovy/walml/wandb/debug', project='narval', log_model=False, offline=True) - logger = None + logger = WandbLogger(name='debug', save_dir='/project/def-bovy/walml/wandb/debug', project='narval', log_model=False, offline=True) + # logger = None datamodule = GalaxyDataModule( label_cols=label_cols, @@ -63,7 +63,7 @@ devices=2, num_nodes=1, strategy='ddp', - # precision='16-mixed', + precision='16-mixed', max_epochs=max_epochs, enable_progress_bar=False, logger=logger From adf9f261319ddba332ac18780d89c1003a4e3e35 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 18 Oct 2023 16:08:10 -0400 Subject: [PATCH 033/307] runs :) crank it a little --- only_for_me/narval/finetune.py | 8 ++++---- only_for_me/narval/finetune.sh | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index 301ba507..d77caf82 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -26,10 +26,10 @@ torch.set_float32_matmul_precision('medium') assert torch.cuda.is_available() - batch_size = 128 - num_workers = 12 - n_blocks = 1 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. - max_epochs = 6 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. \ + batch_size = 512 + num_workers = 10 + n_blocks = 3 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. + max_epochs = 60 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. 
\ train_catalog, _ = galaxy_mnist(root=os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'), download=False, train=True) test_catalog, _ = galaxy_mnist(root=os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'), download=False, train=False) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 3556f1af..30c680d8 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -3,9 +3,10 @@ #SBATCH --nodes=1 #SBATCH --time=0:20:0 #SBATCH --tasks-per-node=2 +#SBATCH --cpus-per-task=12 #SBATCH --gres=gpu:a100:2 -#### SBATCH --cpus-per-task=4 +#### #### SBATCH --mem=16G #### SBATCH --nodes=1 #### SBATCH --time=0:10:0 From 257b4dc68e3eed133cd29e3824d6a5bb2e2624d7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 10:51:11 -0400 Subject: [PATCH 034/307] try 2 node run --- only_for_me/narval/finetune.py | 2 +- only_for_me/narval/finetune.sh | 9 ++++++++- only_for_me/narval/narval.md | 7 ++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py index d77caf82..cde0e6ca 100644 --- a/only_for_me/narval/finetune.py +++ b/only_for_me/narval/finetune.py @@ -61,7 +61,7 @@ os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), accelerator='gpu', devices=2, - num_nodes=1, + num_nodes=2, strategy='ddp', precision='16-mixed', max_epochs=max_epochs, diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 30c680d8..84506204 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -1,11 +1,18 @@ #!/bin/bash #SBATCH --mem=32G -#SBATCH --nodes=1 +#SBATCH --nodes=2 #SBATCH --time=0:20:0 #SBATCH --tasks-per-node=2 #SBATCH --cpus-per-task=12 #SBATCH --gres=gpu:a100:2 +#### SBATCH --mem=32G +#### SBATCH --nodes=1 +#### SBATCH --time=0:20:0 +#### SBATCH --tasks-per-node=2 +#### SBATCH --cpus-per-task=12 +#### SBATCH --gres=gpu:a100:2 + #### #### SBATCH --mem=16G #### SBATCH --nodes=1 diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md index d6700aa7..b1fa5eea 100644 --- a/only_for_me/narval/narval.md +++ b/only_for_me/narval/narval.md @@ -8,6 +8,7 @@ https://prashp.gitlab.io/post/compute-canada-tut/ https://docs.alliancecan.ca/wiki/Python ssh walml@narval.alliancecan.ca +ssh-copy-id to avoid password in future module purge module avail @@ -51,8 +52,12 @@ and my own cloned repos pip install --no-deps -e galaxy-datasets pip install --no-deps -e zoobot +Run training + +sbatch only_for_me/narval/finetune.sh + +Works with simple images on multi-GPU, single node -Multi-node notes https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_2.html# https://pytorch.org/docs/stable/elastic/run.html#environment-variables From 6a375e3d6f43641219f87942cdd39fa040fef607 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 17:26:18 -0400 Subject: [PATCH 035/307] webds works, try cluster --- .gitignore | 4 +- only_for_me/narval/finetune.sh | 1 + only_for_me/narval/gz_decals_webdataset.py | 120 ++++++++++++++++ only_for_me/narval/train.py | 130 ++++++++++++++++++ .../training/train_with_pytorch_lightning.py | 78 +++++++---- zoobot/pytorch/training/webdatamodule.py | 130 ++++++++++++++++++ 6 files changed, 432 insertions(+), 31 deletions(-) create mode 100644 only_for_me/narval/gz_decals_webdataset.py create mode 100644 only_for_me/narval/train.py create mode 100644 zoobot/pytorch/training/webdatamodule.py diff --git a/.gitignore 
b/.gitignore index 6199eb12..d7ae58f9 100755 --- a/.gitignore +++ b/.gitignore @@ -165,4 +165,6 @@ results hparams.yaml -data/pretrained_models \ No newline at end of file +data/pretrained_models + +*.tar \ No newline at end of file diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh index 84506204..665cfba2 100644 --- a/only_for_me/narval/finetune.sh +++ b/only_for_me/narval/finetune.sh @@ -6,6 +6,7 @@ #SBATCH --cpus-per-task=12 #SBATCH --gres=gpu:a100:2 +# https://github.com/webdataset/webdataset-lightning/blob/main/simple_cluster.py #### SBATCH --mem=32G #### SBATCH --nodes=1 #### SBATCH --time=0:20:0 diff --git a/only_for_me/narval/gz_decals_webdataset.py b/only_for_me/narval/gz_decals_webdataset.py new file mode 100644 index 00000000..4abce9eb --- /dev/null +++ b/only_for_me/narval/gz_decals_webdataset.py @@ -0,0 +1,120 @@ +import logging +import os +import shutil +import sys +import cv2 +import json +from itertools import islice +import glob + +import tqdm +import numpy as np +import pandas as pd +from PIL import Image # necessary to avoid PIL.Image error assumption in web_datasets + +from galaxy_datasets.shared import label_metadata +from galaxy_datasets import gz_decals_5 +from galaxy_datasets.transforms import default_transforms +from galaxy_datasets.pytorch import galaxy_dataset + +import webdataset as wds + +def galaxy_to_wds(galaxy: pd.Series, label_cols): + + im = cv2.imread(galaxy['file_loc']) + # cv2 loads BGR for 'history', fix + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + labels = json.dumps(galaxy[label_cols].to_dict()) + id_str = str(galaxy['id_str']) + # print(id_str) + return { + "__key__": id_str, + "image.jpg": im, + "labels.json": labels + } + +def df_to_wds(df: pd.DataFrame, label_cols, save_loc, n_shards): + df['id_str'] = df['id_str'].str.replace('.', '_') + + shard_dfs = np.array_split(df, n_shards) + print('shards: ', len(shard_dfs)) + print('shard size: ', len(shard_dfs[0])) + for shard_n, shard_df in tqdm.tqdm(enumerate(shard_dfs), total=len(shard_dfs)): + shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar') + print(shard_save_loc) + sink = wds.TarWriter(shard_save_loc) + for index, galaxy in shard_df.iterrows(): + sink.write(galaxy_to_wds(galaxy, label_cols)) + sink.close() + +def check_wds(wds_loc): + + dataset = wds.WebDataset(wds_loc) \ + .decode("rgb") + + for sample in islice(dataset, 0, 3): + print(sample['__key__']) + print(sample['image.jpg'].shape) # .decode(jpg) converts to decoded to 0-1 RGB float, was 0-255 + print(type(sample['labels.json'])) # automatically decoded + +def identity(x): + # no lambda to be pickleable + return x + + +def load_wds(wds_loc): + + augmentation_transform = default_transforms() # A.Compose object + def do_transform(img): + return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) + + dataset = wds.WebDataset(wds_loc) \ + .decode("rgb") \ + .to_tuple('image.jpg', 'labels.json') \ + .map_tuple(do_transform, identity) + + for sample in islice(dataset, 0, 3): + print(sample[0].shape) + print(sample[1]) + + +def main(): + + train_catalog, _ = gz_decals_5(root='/home/walml/repos/zoobot/only_for_me/narval/temp', download=False, train=True) + + train_catalog = train_catalog[:512*64] + label_cols = label_metadata.decals_dr5_ortho_label_cols + + save_loc = "gz_decals_5_train.tar" + + # df_to_wds(train_catalog, label_cols, save_loc, n_shards=8) + + # check_wds(save_loc) + + # load_wds(save_loc) + + import 
zoobot.pytorch.training.webdatamodule as webdatamodule + + wdm = webdatamodule.WebDataModule( + train_urls=glob.glob(save_loc.replace('.tar', '_*.tar')), + val_urls=[], + # train_size=len(train_catalog), + # val_size=0, + label_cols=label_cols, + num_workers=1 + ) + wdm.setup('fit') + + for sample in islice(wdm.train_dataloader(), 0, 3): + images, labels = sample + print(images.shape) + # print(len(labels)) # list of dicts + print(labels) + exit() + + + +if __name__ == '__main__': + + main() + diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py new file mode 100644 index 00000000..eb616450 --- /dev/null +++ b/only_for_me/narval/train.py @@ -0,0 +1,130 @@ +import logging +import os +import argparse +import glob + +from pytorch_lightning.loggers import WandbLogger +import wandb + +from zoobot.pytorch.training import train_with_pytorch_lightning +from zoobot.shared import benchmark_datasets, schemas + + +if __name__ == '__main__': + + """ + Used to create the PyTorch pretrained weights checkpoints + See .sh file of the same name for args used. + + See zoobot/pytorch/examples/minimal_examples.py for a friendlier example + """ + parser = argparse.ArgumentParser() + parser.add_argument('--save-dir', dest='save_dir', type=str) + # parser.add_argument('--data-dir', dest='data_dir', type=str) + # parser.add_argument('--dataset', dest='dataset', type=str, help='dataset to use, either "gz_decals_dr5" or "gz_evo"') + parser.add_argument('--architecture', dest='architecture_name', default='efficientnet', type=str) + parser.add_argument('--resize-after-crop', dest='resize_after_crop', + type=int, default=224) + parser.add_argument('--color', default=False, action='store_true') + parser.add_argument('--batch-size', dest='batch_size', + default=256, type=int) + parser.add_argument('--gpus', dest='gpus', default=1, type=int) + parser.add_argument('--nodes', dest='nodes', default=1, type=int) + parser.add_argument('--mixed-precision', dest='mixed_precision', + default=False, action='store_true') + parser.add_argument('--debug', dest='debug', + default=False, action='store_true') + parser.add_argument('--wandb', dest='wandb', + default=False, action='store_true') + parser.add_argument('--seed', dest='random_state', default=42, type=int) + args = parser.parse_args() + + """ + debug + python only_for_me/narval/train.py --save-dir only_for_me/narval/debug_models --batch-size 32 --color + """ + + logging.basicConfig(level=logging.INFO) + + random_state = args.random_state + + # if args.nodes > 1: + # # at Manchester, our slurm cluster sets TASKS not NTASKS, which then confuses lightning + # if 'SLURM_NTASKS_PER_NODE' not in os.environ.keys(): + # os.environ['SLURM_NTASKS_PER_NODE'] = os.environ['SLURM_TASKS_PER_NODE'] + # # log the rest to help debug + # logging.info([(x, y) for (x, y) in os.environ.items() if 'SLURM' in x]) + + if args.debug: + download = False + else: + # download = True # for first use + download = False # for speed afterwards + + if os.path.isdir('/home/walml/repos/zoobot'): + search_str = '/home/walml/repos/zoobot/gz_decals_5_train_*.tar' + + else: + search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/gz_decals_5/gz_decals_5_train_*.tar' + + all_urls = glob.glob(search_str) + assert len(all_urls) > 0, search_str + train_urls, val_urls = all_urls[:6], all_urls[6:] + schema = schemas.decals_dr5_ortho_schema + + + # if args.dataset == 'gz_decals_dr5': + # schema, (train_catalog, val_catalog, test_catalog) = 
benchmark_datasets.get_gz_decals_dr5_benchmark_dataset(args.data_dir, random_state, download=download) + # elif args.dataset == 'gz_evo': + # schema, (train_catalog, val_catalog, test_catalog) = benchmark_datasets.get_gz_evo_benchmark_dataset(args.data_dir, random_state, download=download) + # else: + # raise ValueError(f'Dataset {args.dataset} not recognised: should be "gz_decals_dr5" or "gz_evo"') + + + # logging.info('First val galaxy: {}'.format(val_catalog.iloc[0]['id_str'])) + + + # debug mode + if args.debug: + logging.warning( + 'Using debug mode: cutting urls down to 2') + train_urls = train_urls[:2] + val_urls = val_urls[:2] + epochs = 2 + else: + epochs = 1000 + + if args.wandb: + wandb_logger = WandbLogger( + project='narval', + name=os.path.basename(args.save_dir), + log_model=False + ) + else: + wandb_logger = None + + train_with_pytorch_lightning.train_default_zoobot_from_scratch( + save_dir=args.save_dir, + schema=schema, + train_urls = train_urls, + val_urls = val_urls, + test_urls = None, + architecture_name=args.architecture_name, + batch_size=args.batch_size, + epochs=epochs, # rely on early stopping + patience=10, + # augmentation parameters + color=args.color, + resize_after_crop=args.resize_after_crop, + # hardware parameters + gpus=args.gpus, + nodes=args.nodes, + mixed_precision=args.mixed_precision, + wandb_logger=wandb_logger, + prefetch_factor=4, + num_workers=11, # system has 24 cpu, 12 cpu per gpu, leave a little wiggle room + random_state=random_state, + learning_rate=1e-3, + ) + + wandb.finish() \ No newline at end of file diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 5690e0e1..9fdaf83c 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -11,6 +11,7 @@ from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule from zoobot.pytorch.estimators import define_model +from zoobot.pytorch.training import webdatamodule def train_default_zoobot_from_scratch( @@ -22,6 +23,9 @@ def train_default_zoobot_from_scratch( train_catalog=None, val_catalog=None, test_catalog=None, + train_urls=None, + val_urls=None, + test_urls=None, # training time parameters epochs=1000, patience=8, @@ -167,22 +171,6 @@ def train_default_zoobot_from_scratch( Suggest reducing num_workers.""" ) - - if catalog is not None: - assert train_catalog is None - assert val_catalog is None - assert test_catalog is None - catalogs_to_use = { - 'catalog': catalog - } - else: - assert catalog is None - catalogs_to_use = { - 'train_catalog': train_catalog, - 'val_catalog': val_catalog, - 'test_catalog': test_catalog # may be None - } - if wandb_logger is not None: wandb_logger.log_hyperparams({ 'random_state': random_state, @@ -201,20 +189,50 @@ def train_default_zoobot_from_scratch( 'framework': 'pytorch' }) - datamodule = GalaxyDataModule( - label_cols=schema.label_cols, - # can take either a catalog (and split it), or a pre-split catalog - **catalogs_to_use, - # augmentations parameters - greyscale=not color, - crop_scale_bounds=crop_scale_bounds, - crop_ratio_bounds=crop_ratio_bounds, - resize_after_crop=resize_after_crop, - # hardware parameters - batch_size=batch_size, # on 2xA100s, 256 with DDP, 512 with distributed (i.e. 
split batch) - num_workers=num_workers, - prefetch_factor=prefetch_factor - ) + # work out what dataset the user has passed + single_catalog = catalog is not None + split_catalogs = train_catalog is not None + webdatasets = train_urls is not None + + if single_catalog or split_catalogs: + # this branch will use GalaxyDataModule to load catalogs + assert not webdatasets + if single_catalog: + assert not split_catalogs + data_to_use = { + 'catalog': catalog + } + else: + data_to_use = { + 'train_catalog': train_catalog, + 'val_catalog': val_catalog, + 'test_catalog': test_catalog # may be None + } + datamodule = GalaxyDataModule( + label_cols=schema.label_cols, + # can take either a catalog (and split it), or a pre-split catalog + **data_to_use, + # augmentations parameters + greyscale=not color, + crop_scale_bounds=crop_scale_bounds, + crop_ratio_bounds=crop_ratio_bounds, + resize_after_crop=resize_after_crop, + # hardware parameters + batch_size=batch_size, # on 2xA100s, 256 with DDP, 512 with distributed (i.e. split batch) + num_workers=num_workers, + prefetch_factor=prefetch_factor + ) + else: + # this branch will use WebDataModule to load premade webdatasets + datamodule = webdatamodule.WebDataModule( + train_urls=train_urls, + val_urls=val_urls, + batch_size=batch_size, + num_workers=num_workers, + label_cols=schema.label_cols + # TODO pass through the rest + ) + datamodule.setup(stage='fit') # these args are automatically logged diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py new file mode 100644 index 00000000..bf1fb91b --- /dev/null +++ b/zoobot/pytorch/training/webdatamodule.py @@ -0,0 +1,130 @@ +import os + +import torch.utils.data +import numpy as np +import pytorch_lightning as pl + +import webdataset as wds + +from galaxy_datasets.transforms import default_transforms + +# https://github.com/webdataset/webdataset-lightning/blob/main/train.py +class WebDataModule(pl.LightningDataModule): + def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4): + super().__init__() + self.train_urls = train_urls + self.val_urls = val_urls + + if train_size is None: + # assume the size of each shard is encoded in the filename as ..._{size}.tar + train_size = sum([int(url.rstrip('.tar').split('_')[-1]) for url in train_urls]) + if val_size is None: + val_size = sum([int(url.rstrip('.tar').split('_')[-1]) for url in val_urls]) + + self.train_size = train_size + self.val_size = val_size + + self.label_cols = label_cols + + self.batch_size = batch_size + self.num_workers = num_workers + + print("train_urls = ", self.train_urls) + print("val_urls = ", self.val_urls) + print("train_size = ", self.train_size) + print("val_size = ", self.val_size) + print("batch_size", self.batch_size, "num_workers", self.num_workers) + + def make_image_transform(self, mode="train"): + # if mode == "train": + # elif mode == "val": + + augmentation_transform = default_transforms() # A.Compose object + def do_transform(img): + return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) + return do_transform + + def make_label_transform(self): + if self.label_cols is not None: + def label_transform(label_dict): + return torch.from_numpy(np.array([label_dict.get(col, 0) for col in self.label_cols])) + return label_transform + else: + return identity # do nothing + + def make_loader(self, urls, mode="train"): + if mode == "train": + dataset_size = self.train_size + 
shuffle = 5000 + elif mode == "val": + dataset_size = self.val_size + shuffle = 0 + + transform_image = self.make_image_transform(mode=mode) + + transform_label = self.make_label_transform() + + dataset = ( + # https://webdataset.github.io/webdataset/multinode/ + # WDS 'knows' which worker it is running on and selects a subset of urls accordingly + wds.WebDataset(urls) + .shuffle(shuffle) + .decode("rgb") + .to_tuple('image.jpg', 'labels.json') + .map_tuple(transform_image, transform_label) + # torch collate stacks dicts nicely while webdataset only lists them + # so use the torch collate instead + .batched(self.batch_size, torch.utils.data.default_collate, partial=False) + ) + + # from itertools import islice + # for batch in islice(dataset, 0, 3): + # images, labels = batch + # # print(len(sample)) + # print(images.shape) + # print(len(labels)) # list of dicts + # # exit() + + loader = wds.WebLoader( + dataset, + batch_size=None, # already batched + shuffle=False, + num_workers=self.num_workers, + ) + + # print('sampling') + # for sample in islice(loader, 0, 3): + # images, labels = sample + # print(images.shape) + # print(len(labels)) # list of dicts + # exit() + + loader.length = dataset_size // self.batch_size + + # temp hack instead + assert dataset_size % self.batch_size == 0 + # if mode == "train": + # ensure same number of batches in all clients + # loader = loader.ddp_equalize(dataset_size // self.batch_size) + # print("# loader length", len(loader)) + + return loader + + def train_dataloader(self): + return self.make_loader(self.train_urls, mode="train") + + def val_dataloader(self): + return self.make_loader(self.val_urls, mode="val") + + # @staticmethod + # def add_loader_specific_args(parser): + # parser.add_argument("-b", "--batch-size", type=int, default=128) + # parser.add_argument("--workers", type=int, default=6) + # parser.add_argument("--bucket", default="./shards") + # parser.add_argument("--shards", default="imagenet-train-{000000..001281}.tar") + # parser.add_argument("--valshards", default="imagenet-val-{000000..000006}.tar") + # return parser + + +def identity(x): + return x \ No newline at end of file From 6118ee0fd00387e8016c9fc2eb8ed77253c293b5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 17:29:10 -0400 Subject: [PATCH 036/307] non-gpu version --- only_for_me/narval/train.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 only_for_me/narval/train.sh diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh new file mode 100644 index 00000000..9ab4851b --- /dev/null +++ b/only_for_me/narval/train.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --mem=32G +#SBATCH --nodes=1 +#SBATCH --time=0:20:0 +#SBATCH --tasks-per-node=1 +#SBATCH --cpus-per-task=4 + +### SBATCH --gres=gpu:a100:1 + +nvidia-smi + +PYTHON=/home/walml/envs/zoobot39_dev/bin/python + +cp -r /project/def-bovy/walml/data/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ + +ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist + +export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. +# export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
+# echo "r$SLURM_NODEID master: $MASTER_ADDR" +# echo "r$SLURM_NODEID Launching python script" + +REPO_DIR=/project/def-bovy/walml/zoobot/ +srun $PYTHON $REPO_DIR/only_for_me/narval/train.py --save-dir $REPO_DIR/only_for_me/narval/debug_models --batch-size 4 --color --debug +# srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From 6e0c87e49c123c0b23a7d6e4760a0ac953511c4c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 17:37:03 -0400 Subject: [PATCH 037/307] cache --- only_for_me/narval/train.py | 1 + zoobot/pytorch/training/train_with_pytorch_lightning.py | 4 +++- zoobot/pytorch/training/webdatamodule.py | 6 ++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index eb616450..aac758f6 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -125,6 +125,7 @@ num_workers=11, # system has 24 cpu, 12 cpu per gpu, leave a little wiggle room random_state=random_state, learning_rate=1e-3, + cache_dir=os.environ['SLURM_TMPDIR'] ) wandb.finish() \ No newline at end of file diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 9fdaf83c..7b23c0f9 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -26,6 +26,7 @@ def train_default_zoobot_from_scratch( train_urls=None, val_urls=None, test_urls=None, + cache_dir=None, # only works with webdataset urls # training time parameters epochs=1000, patience=8, @@ -229,7 +230,8 @@ def train_default_zoobot_from_scratch( val_urls=val_urls, batch_size=batch_size, num_workers=num_workers, - label_cols=schema.label_cols + label_cols=schema.label_cols, + cache_dir=cache_dir # TODO pass through the rest ) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index bf1fb91b..93a3c553 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -10,7 +10,7 @@ # https://github.com/webdataset/webdataset-lightning/blob/main/train.py class WebDataModule(pl.LightningDataModule): - def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4): + def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4, cache_dir=None): super().__init__() self.train_urls = train_urls self.val_urls = val_urls @@ -29,6 +29,8 @@ def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_c self.batch_size = batch_size self.num_workers = num_workers + self.cache_dir = cache_dir + print("train_urls = ", self.train_urls) print("val_urls = ", self.val_urls) print("train_size = ", self.train_size) @@ -67,7 +69,7 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls) + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0) .shuffle(shuffle) .decode("rgb") .to_tuple('image.jpg', 'labels.json') From da64982964dfb48e693335320f032180ce710f76 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 17:42:47 -0400 Subject: [PATCH 038/307] cache tweak --- only_for_me/narval/requirements.txt | 1 + only_for_me/narval/train.py | 5 +++-- only_for_me/narval/train.sh | 4 +--- 3 files changed, 5 insertions(+), 5 deletions(-) 
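The shard naming that df_to_wds writes and WebDataModule then re-parses is worth spelling out: each shard is saved as `<stem>_<shard_index>_<n_galaxies>.tar`, and the datamodule recovers the per-shard size from the last underscore-separated token when train_size/val_size are not passed. A minimal sketch of that round trip (the filenames below are hypothetical examples, not real shards):

    # Sketch only: the shard-naming convention assumed by WebDataModule.
    urls = [
        "gz_decals_5_train_0_2048.tar",   # hypothetical shard names
        "gz_decals_5_train_1_2048.tar",
    ]

    # same parsing as webdatamodule.py: the last '_' token is the shard size
    sizes = [int(url.rstrip('.tar').split('_')[-1]) for url in urls]
    print(sum(sizes))  # 4096 galaxies across both shards

    # caveat: str.rstrip strips a *set* of characters ('.', 't', 'a', 'r'),
    # not the literal '.tar' suffix; safe here only because the size token
    # is purely numeric, so nothing extra gets stripped.
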
diff --git a/only_for_me/narval/requirements.txt b/only_for_me/narval/requirements.txt index 9f22e9df..0daeb251 100644 --- a/only_for_me/narval/requirements.txt +++ b/only_for_me/narval/requirements.txt @@ -12,6 +12,7 @@ astropy pandas matplotlib fastparquet # replacing pyarrow below +webdataset # below already required by packages above # tqdm diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index aac758f6..c90d67c2 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -117,7 +117,8 @@ color=args.color, resize_after_crop=args.resize_after_crop, # hardware parameters - gpus=args.gpus, + # gpus=args.gpus, + gpus=0, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, @@ -125,7 +126,7 @@ num_workers=11, # system has 24 cpu, 12 cpu per gpu, leave a little wiggle room random_state=random_state, learning_rate=1e-3, - cache_dir=os.environ['SLURM_TMPDIR'] + cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' ) wandb.finish() \ No newline at end of file diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 9ab4851b..1299b1da 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -11,9 +11,7 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -cp -r /project/def-bovy/walml/data/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ - -ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist +mkdir $SLURM_TMPDIR/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. From 4944a7080e3c4f6779552f774ff4dd330978c002 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 17:47:49 -0400 Subject: [PATCH 039/307] gpu now --- only_for_me/narval/train.py | 3 ++- only_for_me/narval/train.sh | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index c90d67c2..334404c8 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,7 +118,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=0, + gpus=1, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, @@ -127,6 +127,7 @@ random_state=random_state, learning_rate=1e-3, cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) wandb.finish() \ No newline at end of file diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 1299b1da..8444a0db 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,9 +1,9 @@ #!/bin/bash -#SBATCH --mem=32G +#SBATCH --mem=80G #SBATCH --nodes=1 #SBATCH --time=0:20:0 #SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=4 +#SBATCH --cpus-per-task=12 ### SBATCH --gres=gpu:a100:1 @@ -19,5 +19,5 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID Launching python script" REPO_DIR=/project/def-bovy/walml/zoobot/ -srun $PYTHON $REPO_DIR/only_for_me/narval/train.py --save-dir $REPO_DIR/only_for_me/narval/debug_models --batch-size 4 --color --debug +srun $PYTHON $REPO_DIR/only_for_me/narval/train.py --save-dir $REPO_DIR/only_for_me/narval/debug_models --batch-size 256 --color --debug # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From c727c25afd2b7a83cd79dbdc9af8295d5205805b Mon Sep 17 00:00:00 2001 
From: Mike Walmsley Date: Fri, 3 Nov 2023 17:48:50 -0400 Subject: [PATCH 040/307] typo --- only_for_me/narval/train.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 8444a0db..20382ee5 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -4,8 +4,7 @@ #SBATCH --time=0:20:0 #SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=12 - -### SBATCH --gres=gpu:a100:1 +#SBATCH --gres=gpu:a100:1 nvidia-smi From f6dcf9adaa80f3e1ddb51209dff9ce87100d3c80 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 17:59:44 -0400 Subject: [PATCH 041/307] mixed precision --- only_for_me/narval/gz_decals_webdataset.py | 9 +++++---- only_for_me/narval/train.sh | 7 +++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/gz_decals_webdataset.py b/only_for_me/narval/gz_decals_webdataset.py index 4abce9eb..ae416774 100644 --- a/only_for_me/narval/gz_decals_webdataset.py +++ b/only_for_me/narval/gz_decals_webdataset.py @@ -81,13 +81,14 @@ def do_transform(img): def main(): train_catalog, _ = gz_decals_5(root='/home/walml/repos/zoobot/only_for_me/narval/temp', download=False, train=True) - - train_catalog = train_catalog[:512*64] + # print(len(train_catalog)) + # exit() + train_catalog = train_catalog[:88*2048] label_cols = label_metadata.decals_dr5_ortho_label_cols - save_loc = "gz_decals_5_train.tar" + save_loc = "/home/walml/repos/zoobot/only_for_me/narval/gz_decals_5/gz_decals_5_train.tar" - # df_to_wds(train_catalog, label_cols, save_loc, n_shards=8) + df_to_wds(train_catalog, label_cols, save_loc, n_shards=44) # check_wds(save_loc) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 20382ee5..d7e1e502 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -4,7 +4,7 @@ #SBATCH --time=0:20:0 #SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:1 +#SBATCH --gres=gpu:a100:2 nvidia-smi @@ -18,5 +18,8 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID Launching python script" REPO_DIR=/project/def-bovy/walml/zoobot/ -srun $PYTHON $REPO_DIR/only_for_me/narval/train.py --save-dir $REPO_DIR/only_for_me/narval/debug_models --batch-size 256 --color --debug +srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ + --save-dir $REPO_DIR/only_for_me/narval/debug_models \ + --batch-size 512 \ + --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From 9be727b43c62507859e919fe7aabf93b0b126000 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 18:07:47 -0400 Subject: [PATCH 042/307] 2 --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index d7e1e502..c3bbdd61 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,7 +2,7 @@ #SBATCH --mem=80G #SBATCH --nodes=1 #SBATCH --time=0:20:0 -#SBATCH --tasks-per-node=1 +#SBATCH --tasks-per-node=2 #SBATCH --cpus-per-task=12 #SBATCH --gres=gpu:a100:2 From 6f3f355013402df81c081eea53d6f6764d0d164e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 18:11:08 -0400 Subject: [PATCH 043/307] full dataset as it arrives --- only_for_me/narval/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 334404c8..477f8535 100644 --- 
a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -65,7 +65,7 @@ search_str = '/home/walml/repos/zoobot/gz_decals_5_train_*.tar' else: - search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/gz_decals_5/gz_decals_5_train_*.tar' + search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/gz_decals_5/full/gz_decals_5_train_*.tar' all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str From c43cf30d28b3acdeb4e710c14f56f49cf998632f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 19:02:52 -0400 Subject: [PATCH 044/307] OOM only on one gpu, weird --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 477f8535..3fe7c516 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -22,7 +22,7 @@ parser.add_argument('--save-dir', dest='save_dir', type=str) # parser.add_argument('--data-dir', dest='data_dir', type=str) # parser.add_argument('--dataset', dest='dataset', type=str, help='dataset to use, either "gz_decals_dr5" or "gz_evo"') - parser.add_argument('--architecture', dest='architecture_name', default='efficientnet', type=str) + parser.add_argument('--architecture', dest='architecture_name', default='efficientnet_b0', type=str) parser.add_argument('--resize-after-crop', dest='resize_after_crop', type=int, default=224) parser.add_argument('--color', default=False, action='store_true') @@ -69,7 +69,7 @@ all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str - train_urls, val_urls = all_urls[:6], all_urls[6:] + train_urls, val_urls = all_urls[:38], all_urls[38:] schema = schemas.decals_dr5_ortho_schema diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c3bbdd61..1c3c9533 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -20,6 +20,6 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 512 \ + --batch-size 256 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From c65fad910a72d015b7089167b3844bc2758127ba Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 19:08:29 -0400 Subject: [PATCH 045/307] 2 gpus --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 3fe7c516..e95a53f8 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,7 +118,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=1, + gpus=2, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 1c3c9533..c3bbdd61 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -20,6 +20,6 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 256 \ + --batch-size 512 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py 
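One thing the gpus/batch-size churn above hides is the effective (global) batch size: with this WebDataModule each DDP rank batches its own shard subset via .batched(), so --batch-size is per process and the global batch scales with the world size. A rough sketch of the arithmetic, with values mirroring this patch (2 GPUs, --batch-size 512) and illustrative names:

    # Sketch only: effective batch size under DDP with the WebDataModule above.
    per_rank_batch = 512   # --batch-size, applied by .batched() inside each rank
    gpus_per_node = 2
    nodes = 1

    world_size = gpus_per_node * nodes
    global_batch = per_rank_batch * world_size
    print(global_batch)    # 1024 galaxies contribute to each optimiser step
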
From a62175539d8b3c1cad57ef9f11595b127ec0e55e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 19:34:23 -0400 Subject: [PATCH 046/307] add nodesplitter --- only_for_me/narval/gz_decals_webdataset.py | 2 +- zoobot/pytorch/training/webdatamodule.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/gz_decals_webdataset.py b/only_for_me/narval/gz_decals_webdataset.py index ae416774..06045dd3 100644 --- a/only_for_me/narval/gz_decals_webdataset.py +++ b/only_for_me/narval/gz_decals_webdataset.py @@ -88,7 +88,7 @@ def main(): save_loc = "/home/walml/repos/zoobot/only_for_me/narval/gz_decals_5/gz_decals_5_train.tar" - df_to_wds(train_catalog, label_cols, save_loc, n_shards=44) + # df_to_wds(train_catalog, label_cols, save_loc, n_shards=44) # check_wds(save_loc) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index 93a3c553..a8cc1c35 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -53,6 +53,7 @@ def label_transform(label_dict): return label_transform else: return identity # do nothing + def make_loader(self, urls, mode="train"): if mode == "train": @@ -69,7 +70,7 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0) + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) .shuffle(shuffle) .decode("rgb") .to_tuple('image.jpg', 'labels.json') @@ -127,6 +128,13 @@ def val_dataloader(self): # parser.add_argument("--valshards", default="imagenet-val-{000000..000006}.tar") # return parser +def nodesplitter_func(urls): + try: + node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() + return urls[node_id::node_count] + except RuntimeError: + print('Distributed not initialised. 
Hopefully single node.') + return urls def identity(x): return x \ No newline at end of file From fdfc84dbbb83184c68910c94c65c1ea365f8b7ed Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 20:42:09 -0400 Subject: [PATCH 047/307] unpack generators --- zoobot/pytorch/training/webdatamodule.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index a8cc1c35..ae23d2d6 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -1,4 +1,5 @@ import os +import types import torch.utils.data import numpy as np @@ -12,6 +13,11 @@ class WebDataModule(pl.LightningDataModule): def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4, cache_dir=None): super().__init__() + + if isinstance(train_urls, types.GeneratorType): + train_urls = list(train_urls) + if isinstance(val_urls, types.GeneratorType): + val_urls = list(val_urls) self.train_urls = train_urls self.val_urls = val_urls From f7aeac493553e5d18ab5003332c40240393d1327 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 3 Nov 2023 23:46:09 -0400 Subject: [PATCH 048/307] unpack generator --- zoobot/pytorch/training/webdatamodule.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index ae23d2d6..dd01bad6 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -14,10 +14,10 @@ class WebDataModule(pl.LightningDataModule): def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4, cache_dir=None): super().__init__() - if isinstance(train_urls, types.GeneratorType): - train_urls = list(train_urls) - if isinstance(val_urls, types.GeneratorType): - val_urls = list(val_urls) + # if isinstance(train_urls, types.GeneratorType): + # train_urls = list(train_urls) + # if isinstance(val_urls, types.GeneratorType): + # val_urls = list(val_urls) self.train_urls = train_urls self.val_urls = val_urls @@ -135,9 +135,10 @@ def val_dataloader(self): # return parser def nodesplitter_func(urls): + print(urls) try: node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() - return urls[node_id::node_count] + return list(urls)[node_id::node_count] except RuntimeError: print('Distributed not initialised. 
Hopefully single node.') return urls From 1b2e2f21faba039284a24698aac0674b53688091 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 09:50:43 -0400 Subject: [PATCH 049/307] try repeat and pin memory --- only_for_me/narval/train.sh | 2 +- zoobot/pytorch/training/webdatamodule.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c3bbdd61..0c5a1121 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --mem=80G #SBATCH --nodes=1 -#SBATCH --time=0:20:0 +#SBATCH --time=0:40:0 #SBATCH --tasks-per-node=2 #SBATCH --cpus-per-task=12 #SBATCH --gres=gpu:a100:2 diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index dd01bad6..a359c635 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -83,7 +83,8 @@ def make_loader(self, urls, mode="train"): .map_tuple(transform_image, transform_label) # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead - .batched(self.batch_size, torch.utils.data.default_collate, partial=False) + .batched(self.batch_size, torch.utils.data.default_collate, partial=False) + .repeat(2) ) # from itertools import islice @@ -97,8 +98,9 @@ def make_loader(self, urls, mode="train"): loader = wds.WebLoader( dataset, batch_size=None, # already batched - shuffle=False, + shuffle=False, # already shuffled num_workers=self.num_workers, + pin_memory=True ) # print('sampling') @@ -134,8 +136,8 @@ def val_dataloader(self): # parser.add_argument("--valshards", default="imagenet-val-{000000..000006}.tar") # return parser -def nodesplitter_func(urls): - print(urls) +def nodesplitter_func(urls): # SimpleShardList + # print(urls) try: node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() return list(urls)[node_id::node_count] From 685e0a875167ebc8968e943624bdb04591d89a1b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 10:17:53 -0400 Subject: [PATCH 050/307] increase prefetch factor --- zoobot/pytorch/training/webdatamodule.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index a359c635..6449af55 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -100,7 +100,8 @@ def make_loader(self, urls, mode="train"): batch_size=None, # already batched shuffle=False, # already shuffled num_workers=self.num_workers, - pin_memory=True + pin_memory=True, + prefetch_factor=10 ) # print('sampling') From fa78fc3479055249eafe3e4aea00abadb9683f59 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 18:07:33 -0400 Subject: [PATCH 051/307] prefetch 4 --- zoobot/pytorch/training/webdatamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index 6449af55..e75aa986 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -101,7 +101,7 @@ def make_loader(self, urls, mode="train"): shuffle=False, # already shuffled num_workers=self.num_workers, pin_memory=True, - prefetch_factor=10 + prefetch_factor=4 ) # print('sampling') From fa5ec259da4cc24921400a7792a0636cdcba9eb8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 18:53:06 -0400 
Subject: [PATCH 052/307] try 4 gpus --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 0c5a1121..a79bc468 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,9 +2,9 @@ #SBATCH --mem=80G #SBATCH --nodes=1 #SBATCH --time=0:40:0 -#SBATCH --tasks-per-node=2 +#SBATCH --tasks-per-node=4 #SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:2 +#SBATCH --gres=gpu:a100:4 nvidia-smi From 38ad66668ffc6a6a17baec36866b00777c844f07 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 18:55:01 -0400 Subject: [PATCH 053/307] 4 gpu --- only_for_me/narval/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index e95a53f8..d93b65df 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,7 +118,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=2, + gpus=4, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, From c6b41d670156106e624d01fdb286063c48787603 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 19:24:24 -0400 Subject: [PATCH 054/307] request more mem --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index a79bc468..f8e95afc 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --mem=80G +#SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=0:40:0 #SBATCH --tasks-per-node=4 From 51d9ad6f01d41866ae5a0a2b4ab3d38c83708146 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 19:33:50 -0400 Subject: [PATCH 055/307] 4 gpu with ramdisk --- only_for_me/narval/train.py | 3 ++- only_for_me/narval/train.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index d93b65df..8158f267 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -126,7 +126,8 @@ num_workers=11, # system has 24 cpu, 12 cpu per gpu, leave a little wiggle room random_state=random_state, learning_rate=1e-3, - cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index f8e95afc..fc03f575 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -10,7 +10,8 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -mkdir $SLURM_TMPDIR/cache +# mkdir $SLURM_TMPDIR/cache +mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
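The cache_dir flip-flopping above is a choice between two node-local locations: $SLURM_TMPDIR (job-scoped NVMe scratch, roughly 4TB per node) and /tmp (ramdisk, roughly 400GB per node but faster). A small sketch of making that an explicit switch rather than a commented-out line; the helper name, the use_ramdisk flag and the /tmp fallback are illustrative, not part of the patch:

    import os

    # Sketch only: choose a node-local cache for webdataset shards.
    def pick_cache_dir(use_ramdisk: bool = False) -> str:
        if use_ramdisk:
            base = '/tmp'  # ramdisk: fast, but ~400GB shared per node
        else:
            base = os.environ.get('SLURM_TMPDIR', '/tmp')  # job-scoped NVMe scratch (~4TB)
        cache_dir = os.path.join(base, 'cache')
        os.makedirs(cache_dir, exist_ok=True)  # loader workers need it to exist
        return cache_dir

Either way the directory has to exist before the loader workers start writing, which is what the mkdir lines in train.sh are doing.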
From 4941c151ce21c9a154c4306511f50f702d76a8a6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 20:04:35 -0400 Subject: [PATCH 056/307] 1 gpu baseline ramdisk --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 4 ++-- zoobot/pytorch/training/webdatamodule.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 8158f267..846b88e0 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,7 +118,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=4, + gpus=1, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index fc03f575..9be5eda5 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,9 +2,9 @@ #SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=0:40:0 -#SBATCH --tasks-per-node=4 +#SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:4 +#SBATCH --gres=gpu:a100:1 nvidia-smi diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index e75aa986..f1a94680 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -84,7 +84,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - .repeat(2) + .repeat(5) ) # from itertools import islice From b011cd93acc2fe78df6cb4835fec4fb55ee1ff2c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 20:05:01 -0400 Subject: [PATCH 057/307] bs --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 9be5eda5..301edafb 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,6 +21,6 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 512 \ + --batch-size 256 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From 5a58383637c24dcef73988386ea517a0823cc9bc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 20:25:55 -0400 Subject: [PATCH 058/307] fewer num_workers, 4 gpu --- only_for_me/narval/train.py | 8 ++++---- only_for_me/narval/train.sh | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 846b88e0..a0faaebb 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,16 +118,16 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=1, + gpus=4, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=4, - num_workers=11, # system has 24 cpu, 12 cpu per gpu, leave a little wiggle room + num_workers=6, random_state=random_state, learning_rate=1e-3, - # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - cache_dir='/tmp/cache' + cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + # cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git 
a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 301edafb..eba99a94 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,16 +2,16 @@ #SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=0:40:0 -#SBATCH --tasks-per-node=1 +#SBATCH --tasks-per-node=4 #SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:1 +#SBATCH --gres=gpu:a100:4 nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -# mkdir $SLURM_TMPDIR/cache -mkdir /tmp/cache +mkdir $SLURM_TMPDIR/cache +# mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. @@ -21,6 +21,6 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 256 \ + --batch-size 512 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From d380ef0887049ce998227257198c41228b309f1d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 20:35:03 -0400 Subject: [PATCH 059/307] 4 gpu 6 workers ramdisk --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index a0faaebb..25c755e2 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -126,8 +126,8 @@ num_workers=6, random_state=random_state, learning_rate=1e-3, - cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - # cache_dir='/tmp/cache' + # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index eba99a94..fc03f575 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -10,8 +10,8 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -mkdir $SLURM_TMPDIR/cache -# mkdir /tmp/cache +# mkdir $SLURM_TMPDIR/cache +mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
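The num_workers reductions above track the SLURM geometry: each srun task (one per GPU) gets --cpus-per-task cores, and every dataloader worker is a separate process inside that allocation, so workers per rank need to stay comfortably below cpus-per-task. A sketch of deriving the value from the environment rather than hard-coding it; reading SLURM_CPUS_PER_TASK and the halving heuristic are assumptions for illustration, not something the patch does:

    import os

    # Sketch only: tie dataloader workers to the SLURM allocation.
    # Assumes one srun task per GPU, as in train.sh above.
    cpus_per_task = int(os.environ.get('SLURM_CPUS_PER_TASK', '12'))

    # one possible heuristic: half the cores per task, leaving headroom for the
    # main process, CUDA/NCCL threads and shard decoding
    num_workers = max(1, cpus_per_task // 2)
    print(num_workers)  # 6 with --cpus-per-task=12, matching the value above
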
From 0834d7fce28bde9daca118446c8f3d3e3840915d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 20:42:51 -0400 Subject: [PATCH 060/307] maxvit 4 gpu 6 worker ssd --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 25c755e2..a0faaebb 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -126,8 +126,8 @@ num_workers=6, random_state=random_state, learning_rate=1e-3, - # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - cache_dir='/tmp/cache' + cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + # cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index fc03f575..3fff67f9 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -10,8 +10,8 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -# mkdir $SLURM_TMPDIR/cache -mkdir /tmp/cache +mkdir $SLURM_TMPDIR/cache +# mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. @@ -21,6 +21,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 512 \ + --batch-size 128 \ + --architecture maxvit_tiny_224 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From b6e250370b54baf65a8a6decd0dda3087d5cd238 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 4 Nov 2023 20:54:48 -0400 Subject: [PATCH 061/307] typo --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index a0faaebb..25c755e2 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -126,8 +126,8 @@ num_workers=6, random_state=random_state, learning_rate=1e-3, - cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - # cache_dir='/tmp/cache' + # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 3fff67f9..f5801794 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -10,8 +10,8 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -mkdir $SLURM_TMPDIR/cache -# mkdir /tmp/cache +# mkdir $SLURM_TMPDIR/cache +mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
@@ -22,6 +22,6 @@ REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ --batch-size 128 \ - --architecture maxvit_tiny_224 \ + --architecture maxvit_tiny_tf_224 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From dc1ddef65663531a0df388d43e764f62ee16a6ef Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 5 Nov 2023 10:23:38 -0500 Subject: [PATCH 062/307] maxvit small 64b norepeat --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 4 ++-- zoobot/pytorch/training/webdatamodule.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 25c755e2..915a87f0 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -36,7 +36,7 @@ default=False, action='store_true') parser.add_argument('--wandb', dest='wandb', default=False, action='store_true') - parser.add_argument('--seed', dest='random_state', default=42, type=int) + parser.add_argument('--seed', dest='random_state', default=1, type=int) args = parser.parse_args() """ @@ -97,7 +97,7 @@ if args.wandb: wandb_logger = WandbLogger( project='narval', - name=os.path.basename(args.save_dir), + # name=os.path.basename(args.save_dir), log_model=False ) else: diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index f5801794..c9d21c8b 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,7 +21,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 128 \ - --architecture maxvit_tiny_tf_224 \ + --batch-size 64 \ + --architecture maxvit_small_tf_224 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index f1a94680..96f79c0b 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -84,7 +84,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - .repeat(5) + # .repeat(5) ) # from itertools import islice From cf24ded39bb06de6810a30bc1501c391e167a951 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 5 Nov 2023 12:38:07 -0500 Subject: [PATCH 063/307] single gpu baseline --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 11 ++++++----- zoobot/pytorch/training/webdatamodule.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 915a87f0..0cdfa539 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,11 +118,11 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=4, + gpus=1, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, - prefetch_factor=4, + prefetch_factor=6, num_workers=6, random_state=random_state, learning_rate=1e-3, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c9d21c8b..320eb96d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh 
@@ -2,9 +2,9 @@ #SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=0:40:0 -#SBATCH --tasks-per-node=4 +#SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:4 +#SBATCH --gres=gpu:a100:1 nvidia-smi @@ -13,7 +13,7 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python # mkdir $SLURM_TMPDIR/cache mkdir /tmp/cache -export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. +# export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. # echo "r$SLURM_NODEID master: $MASTER_ADDR" # echo "r$SLURM_NODEID Launching python script" @@ -21,7 +21,8 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 64 \ - --architecture maxvit_small_tf_224 \ + --batch-size 256 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py + + # --architecture maxvit_small_tf_224 \ diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index 96f79c0b..f1a94680 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -84,7 +84,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - # .repeat(5) + .repeat(5) ) # from itertools import islice From 23ca029702a02dcb4c6d6d57155a6c140f63890c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 5 Nov 2023 19:20:10 -0500 Subject: [PATCH 064/307] b512 --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 320eb96d..a10dad29 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,7 +21,7 @@ mkdir /tmp/cache REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 256 \ + --batch-size 512 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From d1e6fa5a1e98851e6edf814d55cbb58f6706f911 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 11:43:26 -0500 Subject: [PATCH 065/307] back to 11 numworkers --- only_for_me/narval/gz_decals_webdataset.py | 20 ++++++++++++-------- only_for_me/narval/train.py | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/only_for_me/narval/gz_decals_webdataset.py b/only_for_me/narval/gz_decals_webdataset.py index 06045dd3..8a18a613 100644 --- a/only_for_me/narval/gz_decals_webdataset.py +++ b/only_for_me/narval/gz_decals_webdataset.py @@ -13,7 +13,7 @@ from PIL import Image # necessary to avoid PIL.Image error assumption in web_datasets from galaxy_datasets.shared import label_metadata -from galaxy_datasets import gz_decals_5 +from galaxy_datasets import gz2 from galaxy_datasets.transforms import default_transforms from galaxy_datasets.pytorch import galaxy_dataset @@ -34,7 +34,7 @@ def galaxy_to_wds(galaxy: pd.Series, label_cols): } def df_to_wds(df: pd.DataFrame, label_cols, save_loc, 
n_shards): - df['id_str'] = df['id_str'].str.replace('.', '_') + df['id_str'] = df['id_str'].astype(str).str.replace('.', '_') shard_dfs = np.array_split(df, n_shards) print('shards: ', len(shard_dfs)) @@ -80,15 +80,19 @@ def do_transform(img): def main(): - train_catalog, _ = gz_decals_5(root='/home/walml/repos/zoobot/only_for_me/narval/temp', download=False, train=True) + train_catalog, _ = gz2(root='/home/walml/repos/zoobot/only_for_me/narval/temp', download=True, train=True) # print(len(train_catalog)) # exit() - train_catalog = train_catalog[:88*2048] - label_cols = label_metadata.decals_dr5_ortho_label_cols - - save_loc = "/home/walml/repos/zoobot/only_for_me/narval/gz_decals_5/gz_decals_5_train.tar" + divisor = 4096 + batches = len(train_catalog) // divisor + print(batches) + train_catalog = train_catalog[:batches*divisor] + print(len(train_catalog)) + label_cols = label_metadata.gz2_ortho_label_cols + + save_loc = "/home/walml/repos/zoobot/only_for_me/narval/gz2/gz2_train.tar" - # df_to_wds(train_catalog, label_cols, save_loc, n_shards=44) + df_to_wds(train_catalog, label_cols, save_loc, n_shards=batches) # check_wds(save_loc) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 0cdfa539..2a584784 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -123,7 +123,7 @@ mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=6, + num_workers=11, random_state=random_state, learning_rate=1e-3, # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' From 666241df0fccecb1a5d70a28ae47c7b3631fdbe8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 12:18:14 -0500 Subject: [PATCH 066/307] smaller batch, still 11 workers --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index a10dad29..320eb96d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,7 +21,7 @@ mkdir /tmp/cache REPO_DIR=/project/def-bovy/walml/zoobot/ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 512 \ + --batch-size 256 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From dece4ab05f014211789926949bc4d9d69697541e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 12:34:18 -0500 Subject: [PATCH 067/307] back to local disk, still 11 workers and b256 --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 2a584784..be36a0f8 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -126,8 +126,8 @@ num_workers=11, random_state=random_state, learning_rate=1e-3, - # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - cache_dir='/tmp/cache' + cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + # cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 320eb96d..094fbd94 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -10,8 +10,8 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -# mkdir $SLURM_TMPDIR/cache -mkdir /tmp/cache +mkdir $SLURM_TMPDIR/cache +# mkdir /tmp/cache # export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL 
backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. From 1feb21528bfb2015a4f8202d67de939b1a45be30 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 13:06:59 -0500 Subject: [PATCH 068/307] seems like reserving is what helped. try 2x cpu --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 094fbd94..ceff355e 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,9 +1,9 @@ #!/bin/bash -#SBATCH --mem-per-cpu 4G +#SBATCH --mem=80G #SBATCH --nodes=1 #SBATCH --time=0:40:0 #SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=12 +#SBATCH --cpus-per-task=24 #SBATCH --gres=gpu:a100:1 nvidia-smi From aeb2150aa64f1ce2b9b24f71a53a1aa110df20f3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 13:20:44 -0500 Subject: [PATCH 069/307] add back NCCL, remove nodesplitter func --- only_for_me/narval/train.sh | 2 +- zoobot/pytorch/training/webdatamodule.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index ceff355e..be3032cd 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -13,7 +13,7 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python mkdir $SLURM_TMPDIR/cache # mkdir /tmp/cache -# export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. +export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. # echo "r$SLURM_NODEID master: $MASTER_ADDR" # echo "r$SLURM_NODEID Launching python script" diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index f1a94680..a965a777 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -76,7 +76,8 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0) + # , nodesplitter=nodesplitter_func) .shuffle(shuffle) .decode("rgb") .to_tuple('image.jpg', 'labels.json') @@ -137,14 +138,14 @@ def val_dataloader(self): # parser.add_argument("--valshards", default="imagenet-val-{000000..000006}.tar") # return parser -def nodesplitter_func(urls): # SimpleShardList - # print(urls) - try: - node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() - return list(urls)[node_id::node_count] - except RuntimeError: - print('Distributed not initialised. Hopefully single node.') - return urls +# def nodesplitter_func(urls): # SimpleShardList +# # print(urls) +# try: +# node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() +# return list(urls)[node_id::node_count] +# except RuntimeError: +# print('Distributed not initialised. 
Hopefully single node.') +# return urls def identity(x): return x \ No newline at end of file From fdc8472f99b6d37fbb66d5fd78a0c2c051076d0a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 13:38:10 -0500 Subject: [PATCH 070/307] either slurm, or .repeat --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 7 +++---- zoobot/pytorch/training/webdatamodule.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index be36a0f8..99fcd0d5 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -122,7 +122,7 @@ nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, - prefetch_factor=6, + prefetch_factor=4, num_workers=11, random_state=random_state, learning_rate=1e-3, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index be3032cd..f19488da 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,9 @@ #!/bin/bash -#SBATCH --mem=80G #SBATCH --nodes=1 #SBATCH --time=0:40:0 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=24 -#SBATCH --gres=gpu:a100:1 +#SBATCH --tasks-per-node=2 +#SBATCH --cpus-per-task=12 +#SBATCH --gres=gpu:a100:2 nvidia-smi diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index a965a777..be51cbc3 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -85,7 +85,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - .repeat(5) + # .repeat(5) ) # from itertools import islice From c32524a389211092d9e5b97e58ca8525af9b93fc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 13:43:00 -0500 Subject: [PATCH 071/307] mem --- only_for_me/narval/train.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index f19488da..bfd57175 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,4 +1,5 @@ #!/bin/bash +#SBATCH --mem=80G #SBATCH --nodes=1 #SBATCH --time=0:40:0 #SBATCH --tasks-per-node=2 From 1c0b220335da60a9f9d1d698c4b456842c7f4387 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 14:04:24 -0500 Subject: [PATCH 072/307] is it slurm, or repeat? 
--- zoobot/pytorch/training/webdatamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index be51cbc3..a965a777 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -85,7 +85,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - # .repeat(5) + .repeat(5) ) # from itertools import islice From 6c70398fe706964a1fc8e6ac0f73e004632eb80e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 15:14:47 -0500 Subject: [PATCH 073/307] revert to single gpu ramdisk 6 workers b256 --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 10 +++++----- zoobot/pytorch/training/webdatamodule.py | 24 ++++++++++++------------ 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 99fcd0d5..8eda01c1 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -122,8 +122,8 @@ nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, - prefetch_factor=4, - num_workers=11, + prefetch_factor=6, + num_workers=6, random_state=random_state, learning_rate=1e-3, cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index bfd57175..d3f92cff 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,17 +1,17 @@ #!/bin/bash -#SBATCH --mem=80G +#SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=0:40:0 -#SBATCH --tasks-per-node=2 +#SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:2 +#SBATCH --gres=gpu:a100:1 nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -mkdir $SLURM_TMPDIR/cache -# mkdir /tmp/cache +# mkdir $SLURM_TMPDIR/cache +mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
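The last several patches keep flipping the webdataset shard cache between $SLURM_TMPDIR (node-local NVMe, ~4TB per the train.py comment) and /tmp (ramdisk, ~400GB). A small sketch of folding that toggle into a single helper; pick_cache_dir is a hypothetical name, not something these patches define:

    import os

    def pick_cache_dir(prefer_ramdisk: bool = False) -> str:
        # /tmp is ramdisk on these nodes (~400GB total); SLURM_TMPDIR points at node-local NVMe (~4TB total).
        base = '/tmp' if prefer_ramdisk else os.environ.get('SLURM_TMPDIR', '/tmp')
        cache_dir = os.path.join(base, 'cache')
        os.makedirs(cache_dir, exist_ok=True)
        return cache_dir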
diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index a965a777..8fd33d47 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -11,7 +11,7 @@ # https://github.com/webdataset/webdataset-lightning/blob/main/train.py class WebDataModule(pl.LightningDataModule): - def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4, cache_dir=None): + def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4, prefetch_factor=4, cache_dir=None): super().__init__() # if isinstance(train_urls, types.GeneratorType): @@ -34,6 +34,7 @@ def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_c self.batch_size = batch_size self.num_workers = num_workers + self.prefetch_factor = prefetch_factor self.cache_dir = cache_dir @@ -76,8 +77,7 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0) - # , nodesplitter=nodesplitter_func) + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) .shuffle(shuffle) .decode("rgb") .to_tuple('image.jpg', 'labels.json') @@ -102,7 +102,7 @@ def make_loader(self, urls, mode="train"): shuffle=False, # already shuffled num_workers=self.num_workers, pin_memory=True, - prefetch_factor=4 + prefetch_factor=self.prefetch_factor ) # print('sampling') @@ -138,14 +138,14 @@ def val_dataloader(self): # parser.add_argument("--valshards", default="imagenet-val-{000000..000006}.tar") # return parser -# def nodesplitter_func(urls): # SimpleShardList -# # print(urls) -# try: -# node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() -# return list(urls)[node_id::node_count] -# except RuntimeError: -# print('Distributed not initialised. Hopefully single node.') -# return urls +def nodesplitter_func(urls): # SimpleShardList + # print(urls) + try: + node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() + return list(urls)[node_id::node_count] + except RuntimeError: + # print('Distributed not initialised. 
Hopefully single node.') + return urls def identity(x): return x \ No newline at end of file From 34b80e35dfb37db8e97415e144bd4930a3ac106a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 18:32:25 -0500 Subject: [PATCH 074/307] ramdisk --- only_for_me/narval/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 8eda01c1..0cdfa539 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -126,8 +126,8 @@ num_workers=6, random_state=random_state, learning_rate=1e-3, - cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - # cache_dir='/tmp/cache' + # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) From 7adc16973ec5fb33969754613051dd32283dc7b3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 19:51:54 -0500 Subject: [PATCH 075/307] test on beluga --- only_for_me/narval/train.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index d3f92cff..0e88ad72 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash -#SBATCH --mem-per-cpu 4G +#SBATCH --mem-per-cpu 3G #SBATCH --nodes=1 -#SBATCH --time=0:40:0 +#SBATCH --time=0:50:0 #SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:1 +#SBATCH --cpus-per-task=10 +#SBATCH --gres=gpu:v100:1 nvidia-smi From 1d66218e5d06b4d793bde2bfbdc6c30aca8b6527 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 20:04:50 -0500 Subject: [PATCH 076/307] debug sympy --- only_for_me/narval/train.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 0e88ad72..098119e2 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -8,7 +8,8 @@ nvidia-smi -PYTHON=/home/walml/envs/zoobot39_dev/bin/python +# PYTHON=/home/walml/envs/zoobot39_dev/bin/python +source ~/envs/zoobot39_dev/bin/activate # mkdir $SLURM_TMPDIR/cache mkdir /tmp/cache From 0f605ed91526a15c065d5c3e0c91f93bb35e75d5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 21:46:18 -0500 Subject: [PATCH 077/307] longer run, more workers/cpus --- only_for_me/narval/narval.md | 1 + only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md index b1fa5eea..8197dfab 100644 --- a/only_for_me/narval/narval.md +++ b/only_for_me/narval/narval.md @@ -25,6 +25,7 @@ Just for venv: Latest is currently 2.0.1 (no 2.1.0 yet) pip install --no-index torch==2.0.1 torchvision torchtext torchaudio + pip install --no-index pytorch_lightning wandb Storage under /home/user is not ideal, 50gb space. Use /project/def-bovy/walml (1TB space). Can transfer data via rsync login node. 
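The train.py and train.sh changes that follow raise num_workers and --cpus-per-task in step (12 workers against 20 CPUs here, other pairings in later patches). A hedged sketch of deriving the worker count from the SLURM allocation instead of hard-coding it; the pattern is illustrative, not code from these patches:

    import os

    # SLURM exports the per-task CPU count when --cpus-per-task is set;
    # keep one CPU back for the main training process.
    cpus_per_task = int(os.environ.get('SLURM_CPUS_PER_TASK', '4'))
    num_workers = max(1, cpus_per_task - 1)
    print(f'{num_workers} dataloader workers from {cpus_per_task} allocated CPUs')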
diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 0cdfa539..9d06dbfc 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -123,7 +123,7 @@ mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=6, + num_workers=12, random_state=random_state, learning_rate=1e-3, # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 098119e2..1c8dccf4 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,9 +1,9 @@ #!/bin/bash #SBATCH --mem-per-cpu 3G #SBATCH --nodes=1 -#SBATCH --time=0:50:0 +#SBATCH --time=2:50:0 #SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=10 +#SBATCH --cpus-per-task=20 #SBATCH --gres=gpu:v100:1 nvidia-smi From 6d3f52895ccd321e2f7668ac969fc952032db3f8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 21:54:45 -0500 Subject: [PATCH 078/307] oops slash --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 1c8dccf4..a95e8275 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -19,7 +19,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID master: $MASTER_ADDR" # echo "r$SLURM_NODEID Launching python script" -REPO_DIR=/project/def-bovy/walml/zoobot/ +REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ --batch-size 256 \ From c6c1f39535825e1239f740e75d1366c1a95cf84b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 21:55:48 -0500 Subject: [PATCH 079/307] fine, normal python --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index a95e8275..c528a4af 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -8,8 +8,8 @@ nvidia-smi -# PYTHON=/home/walml/envs/zoobot39_dev/bin/python -source ~/envs/zoobot39_dev/bin/activate +PYTHON=/home/walml/envs/zoobot39_dev/bin/python +# source ~/envs/zoobot39_dev/bin/activate # mkdir $SLURM_TMPDIR/cache mkdir /tmp/cache From 1e9ffc9f1342c26ae3ada5d148e8bc590c27935a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 22:00:45 -0500 Subject: [PATCH 080/307] try 4 gpu v100, 9 workers, ramdisk --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 9d06dbfc..21d939af 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,12 +118,12 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=1, + gpus=4, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=12, + num_workers=9, random_state=random_state, learning_rate=1e-3, # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c528a4af..eddd729c 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash -#SBATCH --mem-per-cpu 3G +#SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=2:50:0 -#SBATCH --tasks-per-node=1 -#SBATCH --cpus-per-task=20 -#SBATCH --gres=gpu:v100:1 
+#SBATCH --tasks-per-node=4 +#SBATCH --cpus-per-task=10 +#SBATCH --gres=gpu:v100:4 nvidia-smi From d015e51c575e7fcdbfe40a2aafdaef9823fd1842 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 22:04:19 -0500 Subject: [PATCH 081/307] still 4 gpu, now on SSD instead --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 21d939af..cfbbcec9 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -126,8 +126,8 @@ num_workers=9, random_state=random_state, learning_rate=1e-3, - # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - cache_dir='/tmp/cache' + cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + # cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index eddd729c..4df00301 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -11,8 +11,8 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python # source ~/envs/zoobot39_dev/bin/activate -# mkdir $SLURM_TMPDIR/cache -mkdir /tmp/cache +mkdir $SLURM_TMPDIR/cache +# mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. From df15eeb44cc650c9088fb9e7ad61f0b07d21beb3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 22:06:06 -0500 Subject: [PATCH 082/307] only 3 workers per --- only_for_me/narval/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index cfbbcec9..5dd6d604 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -123,7 +123,7 @@ mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=9, + num_workers=3, random_state=random_state, learning_rate=1e-3, cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' From 53ce4192cce8f9075682c337d592d6f37f5533a0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 22:24:06 -0500 Subject: [PATCH 083/307] back to a100 big boy test --- only_for_me/narval/train.py | 6 +++--- only_for_me/narval/train.sh | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 5dd6d604..06f11f2d 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -123,11 +123,11 @@ mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=3, + num_workers=11, random_state=random_state, learning_rate=1e-3, - cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - # cache_dir='/tmp/cache' + # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 4df00301..0a5ea619 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -3,16 +3,16 @@ #SBATCH --nodes=1 #SBATCH --time=2:50:0 #SBATCH --tasks-per-node=4 -#SBATCH --cpus-per-task=10 -#SBATCH --gres=gpu:v100:4 +#SBATCH --cpus-per-task=12 +#SBATCH --gres=gpu:a100:4 nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python # source ~/envs/zoobot39_dev/bin/activate -mkdir $SLURM_TMPDIR/cache -# mkdir /tmp/cache +# 
mkdir $SLURM_TMPDIR/cache +mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. From 4dfbe9140037e7908c95d55003232d0a2f7b2434 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 6 Nov 2023 23:20:41 -0500 Subject: [PATCH 084/307] 2 a100 --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 06f11f2d..98847c92 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,7 +118,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=4, + gpus=2, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 0a5ea619..3026a960 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,9 +2,9 @@ #SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=2:50:0 -#SBATCH --tasks-per-node=4 +#SBATCH --tasks-per-node=2 #SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:4 +#SBATCH --gres=gpu:a100:2 nvidia-smi From 8bc53443f7fafbfeed200b5f17d118a685d266c2 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 09:57:34 -0500 Subject: [PATCH 085/307] debug nodesplitter --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 6 +++--- zoobot/pytorch/training/webdatamodule.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 98847c92..33fd9c5a 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -92,7 +92,7 @@ val_urls = val_urls[:2] epochs = 2 else: - epochs = 1000 + epochs = 1 if args.wandb: wandb_logger = WandbLogger( diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 3026a960..eddd729c 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,9 +2,9 @@ #SBATCH --mem-per-cpu 4G #SBATCH --nodes=1 #SBATCH --time=2:50:0 -#SBATCH --tasks-per-node=2 -#SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:2 +#SBATCH --tasks-per-node=4 +#SBATCH --cpus-per-task=10 +#SBATCH --gres=gpu:v100:4 nvidia-smi diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index 8fd33d47..d79e7899 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -1,6 +1,6 @@ import os import types - +import logging import torch.utils.data import numpy as np import pytorch_lightning as pl @@ -142,7 +142,8 @@ def nodesplitter_func(urls): # SimpleShardList # print(urls) try: node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() - return list(urls)[node_id::node_count] + urls_to_use = list(urls)[node_id::node_count] + logging.info('id: {}, of count {}. \nURLS: {} ({})\n\n'.format(node_id, node_count, urls_to_use)) except RuntimeError: # print('Distributed not initialised. 
Hopefully single node.') return urls From ca96a8627aa1787daa8db300a7a4c8533f8169c7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 10:00:58 -0500 Subject: [PATCH 086/307] debug generator --- only_for_me/narval/train.py | 2 +- zoobot/pytorch/training/webdatamodule.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 33fd9c5a..e5c8bbcb 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -123,7 +123,7 @@ mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=11, + num_workers=9, random_state=random_state, learning_rate=1e-3, # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index d79e7899..c38fceb5 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -143,7 +143,7 @@ def nodesplitter_func(urls): # SimpleShardList try: node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() urls_to_use = list(urls)[node_id::node_count] - logging.info('id: {}, of count {}. \nURLS: {} ({})\n\n'.format(node_id, node_count, urls_to_use)) + logging.info(f'id: {node_id}, of count {node_count}. \nURLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\n\n') except RuntimeError: # print('Distributed not initialised. Hopefully single node.') return urls From e0bd2db63876f632c000a1c50729a099e8c47c61 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 10:36:46 -0500 Subject: [PATCH 087/307] slurm tinkering --- only_for_me/narval/train.py | 2 ++ only_for_me/narval/train.sh | 7 ++++--- zoobot/pytorch/training/webdatamodule.py | 21 +++++++++++---------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index e5c8bbcb..f496766e 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -46,6 +46,8 @@ logging.basicConfig(level=logging.INFO) + logging.info(f'WORLD_SIZE: {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}') + random_state = args.random_state # if args.nodes > 1: diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index eddd729c..5368bcec 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,9 +1,10 @@ #!/bin/bash -#SBATCH --mem-per-cpu 4G -#SBATCH --nodes=1 #SBATCH --time=2:50:0 -#SBATCH --tasks-per-node=4 +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 +#SBATCH --mem-per-cpu 4G #SBATCH --gres=gpu:v100:4 nvidia-smi diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index c38fceb5..d989d031 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -77,7 +77,7 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=wds.split_by_node) .shuffle(shuffle) .decode("rgb") .to_tuple('image.jpg', 'labels.json') @@ -138,15 +138,16 @@ def val_dataloader(self): # parser.add_argument("--valshards", default="imagenet-val-{000000..000006}.tar") # return parser 
-def nodesplitter_func(urls): # SimpleShardList - # print(urls) - try: - node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() - urls_to_use = list(urls)[node_id::node_count] - logging.info(f'id: {node_id}, of count {node_count}. \nURLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\n\n') - except RuntimeError: - # print('Distributed not initialised. Hopefully single node.') - return urls +# def nodesplitter_func(urls): # SimpleShardList +# # print(urls) +# try: +# node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size() +# urls_to_use = list(urls)[node_id::node_count] +# logging.info(f'id: {node_id}, of count {node_count}. \nURLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\n\n') +# return urls_to_use +# except RuntimeError: +# # print('Distributed not initialised. Hopefully single node.') +# return urls def identity(x): return x \ No newline at end of file From c02c2eb0d94569dc0a702f26b972eb03f67f04dd Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 10:39:33 -0500 Subject: [PATCH 088/307] oops, 4 gpu --- only_for_me/narval/train.py | 2 +- zoobot/pytorch/training/debug_split.ipynb | 138 ++++++++++++++++++++++ 2 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 zoobot/pytorch/training/debug_split.ipynb diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index f496766e..d65e7cab 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -120,7 +120,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=2, + gpus=4, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/zoobot/pytorch/training/debug_split.ipynb b/zoobot/pytorch/training/debug_split.ipynb new file mode 100644 index 00000000..1da819eb --- /dev/null +++ b/zoobot/pytorch/training/debug_split.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import webdataset as wds\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "world_size: processes participating in job (e.g. 4)\n", + "\n", + "rank: index of current process (e.g. 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['WORLD_SIZE'] = '4'" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['5', '9', '3']\n", + "['6', '2', '7']\n", + "['1', '4']\n", + "['8', '0']\n" + ] + } + ], + "source": [ + "for rank in range(4):\n", + " os.environ['RANK'] = str(rank)\n", + " print(list(wds.split_by_node({str(x) for x in range(10)})))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def nodesplitter_func(urls, node_id, node_count): # SimpleShardList\n", + " # print(urls)\n", + " # try:\n", + " # node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size()\n", + " urls_to_use = list(urls)[node_id::node_count]\n", + " print(f'id: {node_id}, of count {node_count}. \\nURLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\\n\\n')\n", + " # except RuntimeError:\n", + " # # print('Distributed not initialised. 
Hopefully single node.')\n", + " return urls_to_use\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 0, of count 4. \n", + "URLS: 3 of 10 (['5', '9', '3'])\n", + "\n", + "\n", + "['5', '9', '3']\n", + "id: 1, of count 4. \n", + "URLS: 3 of 10 (['6', '2', '7'])\n", + "\n", + "\n", + "['6', '2', '7']\n", + "id: 2, of count 4. \n", + "URLS: 2 of 10 (['1', '4'])\n", + "\n", + "\n", + "['1', '4']\n", + "id: 3, of count 4. \n", + "URLS: 2 of 10 (['8', '0'])\n", + "\n", + "\n", + "['8', '0']\n" + ] + } + ], + "source": [ + "for rank in range(4):\n", + " os.environ['RANK'] = str(rank)\n", + " print(nodesplitter_func({str(x) for x in range(10)}, rank, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 812b85789148ad2ad80695859e47e24fafbedbe0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 10:41:50 -0500 Subject: [PATCH 089/307] better logging --- only_for_me/narval/train.py | 2 -- zoobot/pytorch/training/webdatamodule.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index d65e7cab..3af6adaa 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -46,8 +46,6 @@ logging.basicConfig(level=logging.INFO) - logging.info(f'WORLD_SIZE: {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}') - random_state = args.random_state # if args.nodes > 1: diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index d989d031..106b8f70 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -38,6 +38,9 @@ def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_c self.cache_dir = cache_dir + + logging.info(f'Creating webdatamodule with WORLD_SIZE: {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}') + print("train_urls = ", self.train_urls) print("val_urls = ", self.val_urls) print("train_size = ", self.train_size) From 76b0819bf28292bb82e0cdf0a94c8bdbc4f1276f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 10:46:02 -0500 Subject: [PATCH 090/307] more logging --- zoobot/pytorch/training/webdatamodule.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index 106b8f70..a976863e 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -80,7 +80,7 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=wds.split_by_node) + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) .shuffle(shuffle) .decode("rgb") 
.to_tuple('image.jpg', 'labels.json') @@ -153,4 +153,15 @@ def val_dataloader(self): # return urls def identity(x): - return x \ No newline at end of file + return x + +def nodesplitter_func(urls): + urls_to_use = list(wds.split_by_node(urls)) # rely on WDS for the hard work + logging.info( + f''' + Splitting urls within webdatamodule with WORLD_SIZE: + {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}\n + URLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\n\n) + ''' + ) + return urls_to_use From 60e0e1feeb7d66b06d3d1a4e7f652df26c240689 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 10:49:45 -0500 Subject: [PATCH 091/307] typo --- zoobot/pytorch/training/webdatamodule.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index a976863e..da4833f6 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -156,12 +156,13 @@ def identity(x): return x def nodesplitter_func(urls): + # num_urls = len(list(urls.copy())) urls_to_use = list(wds.split_by_node(urls)) # rely on WDS for the hard work logging.info( f''' Splitting urls within webdatamodule with WORLD_SIZE: {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}\n - URLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\n\n) + URLS: {len(urls_to_use)} ({urls_to_use})\n\n) ''' ) return urls_to_use From 9a821e47f4f66cee2640365b341acfbe329d63fc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 11:22:27 -0500 Subject: [PATCH 092/307] repeat=1 for debugging still, add replace_sampler_ddp flag --- zoobot/pytorch/training/debug_split.ipynb | 20 +++++++++++++++++ .../training/train_with_pytorch_lightning.py | 5 ++++- zoobot/pytorch/training/webdatamodule.py | 22 +++++++++++++++---- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/zoobot/pytorch/training/debug_split.ipynb b/zoobot/pytorch/training/debug_split.ipynb index 1da819eb..d536532d 100644 --- a/zoobot/pytorch/training/debug_split.ipynb +++ b/zoobot/pytorch/training/debug_split.ipynb @@ -106,6 +106,26 @@ " print(nodesplitter_func({str(x) for x in range(10)}, rank, 4))" ] }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3, 4, 0, 1)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wds.utils.pytorch_worker_info()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 7b23c0f9..761bfc96 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -223,6 +223,7 @@ def train_default_zoobot_from_scratch( num_workers=num_workers, prefetch_factor=prefetch_factor ) + replace_sampler_ddp=True else: # this branch will use WebDataModule to load premade webdatasets datamodule = webdatamodule.WebDataModule( @@ -234,6 +235,7 @@ def train_default_zoobot_from_scratch( cache_dir=cache_dir # TODO pass through the rest ) + replace_sampler_ddp=False datamodule.setup(stage='fit') @@ -284,7 +286,8 @@ def train_default_zoobot_from_scratch( callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir, - plugins=plugins + plugins=plugins, + replace_sampler_ddp=replace_sampler_ddp ) logging.info((trainer.strategy, trainer.world_size, diff --git 
a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index da4833f6..da4e8e5b 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -4,6 +4,7 @@ import torch.utils.data import numpy as np import pytorch_lightning as pl +from itertools import islice import webdataset as wds @@ -88,7 +89,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - .repeat(5) + # .repeat(5) ) # from itertools import islice @@ -158,11 +159,24 @@ def identity(x): def nodesplitter_func(urls): # num_urls = len(list(urls.copy())) urls_to_use = list(wds.split_by_node(urls)) # rely on WDS for the hard work - logging.info( + rank, world_size, worker, num_workers = wds.utils.pytorch_worker_info() + logging.debug( f''' Splitting urls within webdatamodule with WORLD_SIZE: - {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}\n - URLS: {len(urls_to_use)} ({urls_to_use})\n\n) + {world_size}, RANK: {rank}, WORKER: {worker} of {num_workers}\n + URLS: {len(urls_to_use)} (e.g. {urls_to_use[0]})\n\n) ''' ) return urls_to_use + + +# def split_by_worker(urls): +# rank, world_size, worker, num_workers = wds.utils.pytorch_worker_info() +# if num_workers > 1: +# logging.info(f'Slicing urls for rank {rank}, world_size {world_size}, worker {worker}') +# for s in islice(urls, worker, None, num_workers): +# yield s +# else: +# logging.warning('only one worker?!') +# for s in urls: +# yield s From c83714ce4a1f69e4a2c6b656ceac821ca8a04095 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 11:24:15 -0500 Subject: [PATCH 093/307] 2.1 update --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 761bfc96..14cf4584 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -223,7 +223,7 @@ def train_default_zoobot_from_scratch( num_workers=num_workers, prefetch_factor=prefetch_factor ) - replace_sampler_ddp=True + use_distributed_sampler=True else: # this branch will use WebDataModule to load premade webdatasets datamodule = webdatamodule.WebDataModule( @@ -235,7 +235,7 @@ def train_default_zoobot_from_scratch( cache_dir=cache_dir # TODO pass through the rest ) - replace_sampler_ddp=False + use_distributed_sampler=False datamodule.setup(stage='fit') @@ -287,7 +287,7 @@ def train_default_zoobot_from_scratch( max_epochs=epochs, default_root_dir=save_dir, plugins=plugins, - replace_sampler_ddp=replace_sampler_ddp + use_distributed_sampler= use_distributed_sampler ) logging.info((trainer.strategy, trainer.world_size, From da2438466f8ab76d793f8601f6ff641cdb7da8ec Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 11:26:43 -0500 Subject: [PATCH 094/307] typo --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 14cf4584..660dc939 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -223,7 +223,7 @@ def 
train_default_zoobot_from_scratch( num_workers=num_workers, prefetch_factor=prefetch_factor ) - use_distributed_sampler=True + use_distributed_sampler=True else: # this branch will use WebDataModule to load premade webdatasets datamodule = webdatamodule.WebDataModule( @@ -235,7 +235,7 @@ def train_default_zoobot_from_scratch( cache_dir=cache_dir # TODO pass through the rest ) - use_distributed_sampler=False + use_distributed_sampler=False datamodule.setup(stage='fit') @@ -287,7 +287,7 @@ def train_default_zoobot_from_scratch( max_epochs=epochs, default_root_dir=save_dir, plugins=plugins, - use_distributed_sampler= use_distributed_sampler + use_distributed_sampler=use_distributed_sampler ) logging.info((trainer.strategy, trainer.world_size, From 45d836c35b067fff06007c147b3a9faba6d1ce3f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 11:49:48 -0500 Subject: [PATCH 095/307] 2 gpu scaling --- only_for_me/narval/train.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 5368bcec..8e6f8265 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --time=2:50:0 #SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 +#SBATCH --gres=gpu:v100:2 nvidia-smi From ea0db1f9957c86f925ab3af4b182d1e988548430 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 11:50:07 -0500 Subject: [PATCH 096/307] 2 gpu --- only_for_me/narval/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 3af6adaa..e5c8bbcb 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -118,7 +118,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=4, + gpus=2, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, From a49904cfb41a18cdb947adc1d32ae9e2ca0b676a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 11:54:36 -0500 Subject: [PATCH 097/307] logging --- zoobot/pytorch/training/webdatamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index da4e8e5b..875d741c 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -160,7 +160,7 @@ def nodesplitter_func(urls): # num_urls = len(list(urls.copy())) urls_to_use = list(wds.split_by_node(urls)) # rely on WDS for the hard work rank, world_size, worker, num_workers = wds.utils.pytorch_worker_info() - logging.debug( + logging.info( f''' Splitting urls within webdatamodule with WORLD_SIZE: {world_size}, RANK: {rank}, WORKER: {worker} of {num_workers}\n From 0f67ab6e5992e18514e8eb98bba3e46814f12580 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 12:30:18 -0500 Subject: [PATCH 098/307] 1 gpu --- only_for_me/narval/train.py | 5 ++++- only_for_me/narval/train.sh | 6 +++--- zoobot/pytorch/training/debug_split.ipynb | 26 +++++++++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index e5c8bbcb..6ce5b745 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -9,6 +9,8 @@ from zoobot.pytorch.training import 
train_with_pytorch_lightning from zoobot.shared import benchmark_datasets, schemas +import pytorch_lightning as pl + if __name__ == '__main__': @@ -47,6 +49,7 @@ logging.basicConfig(level=logging.INFO) random_state = args.random_state + pl.seed_everything(random_state) # if args.nodes > 1: # # at Manchester, our slurm cluster sets TASKS not NTASKS, which then confuses lightning @@ -118,7 +121,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=2, + gpus=1, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 8e6f8265..c90241d1 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --time=2:50:0 #SBATCH --nodes=1 -#SBATCH --ntasks=2 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 +#SBATCH --gres=gpu:v100:1 nvidia-smi diff --git a/zoobot/pytorch/training/debug_split.ipynb b/zoobot/pytorch/training/debug_split.ipynb index d536532d..954cb61a 100644 --- a/zoobot/pytorch/training/debug_split.ipynb +++ b/zoobot/pytorch/training/debug_split.ipynb @@ -126,6 +126,32 @@ "wds.utils.pytorch_worker_info()" ] }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['5', '2']" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import islice\n", + "\n", + "def get_per_worker(urls={str(x) for x in range(10)}, worker_n=1, num_workers=5):\n", + " for s in islice(urls, worker_n, None, num_workers):\n", + " yield s\n", + "\n", + "list(get_per_worker(worker_n=0))" + ] + }, { "cell_type": "code", "execution_count": null, From 2d8c3bb6cf0d34badbaabd9a6cc22ca6be68f264 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 12:37:45 -0500 Subject: [PATCH 099/307] scaling of a100 2x repeat5x --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 8 ++++---- zoobot/pytorch/training/webdatamodule.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 6ce5b745..edc5402c 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -121,7 +121,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=1, + gpus=2, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c90241d1..e09549c0 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,11 +1,11 @@ #!/bin/bash -#SBATCH --time=2:50:0 +#SBATCH --time=1:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:a100:2 nvidia-smi diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/training/webdatamodule.py index 875d741c..40cab0cd 100644 --- a/zoobot/pytorch/training/webdatamodule.py +++ b/zoobot/pytorch/training/webdatamodule.py @@ -89,7 +89,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, 
partial=False) - # .repeat(5) + .repeat(5) ) # from itertools import islice From eb19ada98fe32288ab2ee48b7325aaf48b0ba7a1 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 7 Nov 2023 12:48:46 -0500 Subject: [PATCH 100/307] worked great, now 4x with the split --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index edc5402c..ef1029e6 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -121,7 +121,7 @@ resize_after_crop=args.resize_after_crop, # hardware parameters # gpus=args.gpus, - gpus=2, + gpus=4, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index e09549c0..4525aaa7 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --time=1:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks=2 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:a100:2 +#SBATCH --gres=gpu:a100:4 nvidia-smi From 9bad295a40aef84c925a910454b867aa38070f30 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 9 Nov 2023 17:33:02 -0500 Subject: [PATCH 101/307] add notes --- only_for_me/narval/narval.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md index b1fa5eea..08648b11 100644 --- a/only_for_me/narval/narval.md +++ b/only_for_me/narval/narval.md @@ -46,6 +46,7 @@ Set up repos (I made a .bashrc alias, export PROJECT=/project/def-bovy/walml) +git checkout narval-migration (from zoobot) pip install --no-index -r zoobot/only_for_me/narval/requirements.txt and my own cloned repos @@ -58,6 +59,9 @@ sbatch only_for_me/narval/finetune.sh Works with simple images on multi-GPU, single node +rsync -avz --no-g --no-p /home/walml/repos/zoobot/only_for_me/narval/gz_decals_5/*.tar walml@narval.computecanada.ca:projects/def-bovy/walml/data/webdatasets/gz_decals_5/full + + https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_2.html# https://pytorch.org/docs/stable/elastic/run.html#environment-variables From a5f00ed0bf3c3c1a39de135aa7058aeec662ac01 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 18 Nov 2023 13:38:55 -0500 Subject: [PATCH 102/307] refactor, add mock wds func --- only_for_me/narval/gpu_split.py | 22 ++++ only_for_me/narval/make_webdataset_script.py | 43 +++++++ only_for_me/narval/train.py | 21 +--- .../{training => datasets}/webdatamodule.py | 0 .../pytorch/datasets/webdataset_utils.py | 108 ++++++++---------- zoobot/pytorch/estimators/define_model.py | 23 ++++ .../training/train_with_pytorch_lightning.py | 2 +- 7 files changed, 139 insertions(+), 80 deletions(-) create mode 100644 only_for_me/narval/gpu_split.py create mode 100644 only_for_me/narval/make_webdataset_script.py rename zoobot/pytorch/{training => datasets}/webdatamodule.py (100%) rename only_for_me/narval/gz_decals_webdataset.py => zoobot/pytorch/datasets/webdataset_utils.py (60%) diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py new file mode 100644 index 00000000..e42a743a --- /dev/null +++ b/only_for_me/narval/gpu_split.py @@ -0,0 +1,22 @@ +# import datetime + +import torch +from torch import nn +import torch.nn.functional as F + +import pytorch_lightning as pl + +# import torchvision +import 
torchvision.transforms as transforms +from torchvision.datasets import CIFAR10 +from torch.utils.data import DataLoader + +import argparse + +def main(): + + shards = [f'shard_{x}' for x in range(4)] + + +if __name__=='__main__': + main() \ No newline at end of file diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py new file mode 100644 index 00000000..ad6cb428 --- /dev/null +++ b/only_for_me/narval/make_webdataset_script.py @@ -0,0 +1,43 @@ +import logging + +from itertools import islice +import glob + +from PIL import Image # necessary to avoid PIL.Image error assumption in web_datasets + +from galaxy_datasets.shared import label_metadata +from galaxy_datasets import gz2 + + +from zoobot.pytorch.datasets import webdataset_utils + + +def main(): + + logging.basicConfig(level=logging.INFO) + + train_catalog, _ = gz2(root='/home/walml/repos/zoobot/only_for_me/narval/temp', download=True, train=True) + + divisor = 4096 + n_shards = len(train_catalog) // divisor + logging.info(n_shards) + + train_catalog = train_catalog[:n_shards*divisor] + logging.info(len(train_catalog)) + label_cols = label_metadata.gz2_ortho_label_cols + + save_loc = "/home/walml/repos/zoobot/only_for_me/narval/gz2/gz2_train.tar" + + webdataset_utils.df_to_wds(train_catalog, label_cols, save_loc, n_shards=n_shards) + + # webdataset_utils.load_wds_directly(save_loc) + + # webdataset_utils.load_wds_with_augmentation(save_loc) + + webdataset_utils.load_wds_with_webdatamodule(save_loc, label_cols) + + +if __name__ == '__main__': + + main() + diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index ef1029e6..65ce44c0 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -58,12 +58,6 @@ # # log the rest to help debug # logging.info([(x, y) for (x, y) in os.environ.items() if 'SLURM' in x]) - if args.debug: - download = False - else: - # download = True # for first use - download = False # for speed afterwards - if os.path.isdir('/home/walml/repos/zoobot'): search_str = '/home/walml/repos/zoobot/gz_decals_5_train_*.tar' @@ -75,18 +69,6 @@ train_urls, val_urls = all_urls[:38], all_urls[38:] schema = schemas.decals_dr5_ortho_schema - - # if args.dataset == 'gz_decals_dr5': - # schema, (train_catalog, val_catalog, test_catalog) = benchmark_datasets.get_gz_decals_dr5_benchmark_dataset(args.data_dir, random_state, download=download) - # elif args.dataset == 'gz_evo': - # schema, (train_catalog, val_catalog, test_catalog) = benchmark_datasets.get_gz_evo_benchmark_dataset(args.data_dir, random_state, download=download) - # else: - # raise ValueError(f'Dataset {args.dataset} not recognised: should be "gz_decals_dr5" or "gz_evo"') - - - # logging.info('First val galaxy: {}'.format(val_catalog.iloc[0]['id_str'])) - - # debug mode if args.debug: logging.warning( @@ -120,8 +102,7 @@ color=args.color, resize_after_crop=args.resize_after_crop, # hardware parameters - # gpus=args.gpus, - gpus=4, + gpus=args.gpus, nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, diff --git a/zoobot/pytorch/training/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py similarity index 100% rename from zoobot/pytorch/training/webdatamodule.py rename to zoobot/pytorch/datasets/webdatamodule.py diff --git a/only_for_me/narval/gz_decals_webdataset.py b/zoobot/pytorch/datasets/webdataset_utils.py similarity index 60% rename from only_for_me/narval/gz_decals_webdataset.py rename to zoobot/pytorch/datasets/webdataset_utils.py index 
8a18a613..cc299b30 100644 --- a/only_for_me/narval/gz_decals_webdataset.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -1,7 +1,5 @@ import logging import os -import shutil -import sys import cv2 import json from itertools import islice @@ -19,6 +17,38 @@ import webdataset as wds +import zoobot.pytorch.datasets.webdatamodule as webdatamodule + + +def make_mock_wds(save_dir, label_cols, n_shards, shard_size): + shards = [os.path.join(save_dir, f'mock_shard_{shard_n}') for shard_n in range(n_shards)] + for shard in shards: + sink = wds.TarWriter(shard) + for galaxy_n in range(shard_size): + data = { + "__key__": f'id_{galaxy_n}', + "image.jpg": (np.random.rand(424, 424)*255.).astype(np.uint8), + "labels.json": json.dumps(dict(zip(label_cols, [np.random.randint(low=0, high=10) for _ in range(len(label_cols))]))) + } + sink.write(data) + + + +def df_to_wds(df: pd.DataFrame, label_cols, save_loc, n_shards): + df['id_str'] = df['id_str'].astype(str).str.replace('.', '_') + + shard_dfs = np.array_split(df, n_shards) + logging.info('shards: ', len(shard_dfs)) + logging.info('shard size: ', len(shard_dfs[0])) + for shard_n, shard_df in tqdm.tqdm(enumerate(shard_dfs), total=len(shard_dfs)): + shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar') + logging.info(shard_save_loc) + sink = wds.TarWriter(shard_save_loc) + for _, galaxy in shard_df.iterrows(): + sink.write(galaxy_to_wds(galaxy, label_cols)) + sink.close() + + def galaxy_to_wds(galaxy: pd.Series, label_cols): im = cv2.imread(galaxy['file_loc']) @@ -26,43 +56,26 @@ def galaxy_to_wds(galaxy: pd.Series, label_cols): im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) labels = json.dumps(galaxy[label_cols].to_dict()) id_str = str(galaxy['id_str']) - # print(id_str) return { "__key__": id_str, "image.jpg": im, "labels.json": labels } -def df_to_wds(df: pd.DataFrame, label_cols, save_loc, n_shards): - df['id_str'] = df['id_str'].astype(str).str.replace('.', '_') - - shard_dfs = np.array_split(df, n_shards) - print('shards: ', len(shard_dfs)) - print('shard size: ', len(shard_dfs[0])) - for shard_n, shard_df in tqdm.tqdm(enumerate(shard_dfs), total=len(shard_dfs)): - shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar') - print(shard_save_loc) - sink = wds.TarWriter(shard_save_loc) - for index, galaxy in shard_df.iterrows(): - sink.write(galaxy_to_wds(galaxy, label_cols)) - sink.close() - -def check_wds(wds_loc): +# just for debugging +def load_wds_directly(wds_loc): dataset = wds.WebDataset(wds_loc) \ .decode("rgb") for sample in islice(dataset, 0, 3): - print(sample['__key__']) - print(sample['image.jpg'].shape) # .decode(jpg) converts to decoded to 0-1 RGB float, was 0-255 - print(type(sample['labels.json'])) # automatically decoded - -def identity(x): - # no lambda to be pickleable - return x + logging.info(sample['__key__']) + logging.info(sample['image.jpg'].shape) # .decode(jpg) converts to decoded to 0-1 RGB float, was 0-255 + logging.info(type(sample['labels.json'])) # automatically decoded -def load_wds(wds_loc): +# just for debugging +def load_wds_with_augmentation(wds_loc): augmentation_transform = default_transforms() # A.Compose object def do_transform(img): @@ -74,32 +87,11 @@ def do_transform(img): .map_tuple(do_transform, identity) for sample in islice(dataset, 0, 3): - print(sample[0].shape) - print(sample[1]) - - -def main(): - - train_catalog, _ = gz2(root='/home/walml/repos/zoobot/only_for_me/narval/temp', download=True, train=True) - # print(len(train_catalog)) - # exit() - divisor = 
4096 - batches = len(train_catalog) // divisor - print(batches) - train_catalog = train_catalog[:batches*divisor] - print(len(train_catalog)) - label_cols = label_metadata.gz2_ortho_label_cols - - save_loc = "/home/walml/repos/zoobot/only_for_me/narval/gz2/gz2_train.tar" - - df_to_wds(train_catalog, label_cols, save_loc, n_shards=batches) - - # check_wds(save_loc) - - # load_wds(save_loc) - - import zoobot.pytorch.training.webdatamodule as webdatamodule + logging.info(sample[0].shape) + logging.info(sample[1]) +# just for debugging +def load_wds_with_webdatamodule(save_loc, label_cols): wdm = webdatamodule.WebDataModule( train_urls=glob.glob(save_loc.replace('.tar', '_*.tar')), val_urls=[], @@ -112,14 +104,12 @@ def main(): for sample in islice(wdm.train_dataloader(), 0, 3): images, labels = sample - print(images.shape) - # print(len(labels)) # list of dicts - print(labels) - exit() + logging.info(images.shape) + # logging.info(len(labels)) # list of dicts + logging.info(labels) - -if __name__ == '__main__': - - main() +def identity(x): + # no lambda to be pickleable + return x diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index b55d9fed..3b9c238d 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -316,6 +316,10 @@ def get_pytorch_encoder( """ # num_classes=0 gives pooled encoder # https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/efficientnet.py + + # if architecture_name == 'toy': + # logging.warning('Using toy encoder') + # return ToyEncoder() # support older code that didn't specify effnet version if architecture_name == 'efficientnet': @@ -357,3 +361,22 @@ def get_pytorch_dirichlet_head(encoder_dim: int, output_dim: int, test_time_drop modules_to_use.append(efficientnet_custom.custom_top_dirichlet(encoder_dim, output_dim)) return nn.Sequential(*modules_to_use) + + +# class ToyEncoder(pl.LightningModule): + +# def __init__(self): +# super(ToyEncoder, self).__init__() + +# self.conv1 = nn.Conv2d(3, 6, 5) +# self.pool = nn.MaxPool2d(2, 2) +# self.conv2 = nn.Conv2d(6, 16, 5) +# # pool again +# self.fc1 = nn.Linear(16 * 5 * 5, 1280) # dim 1280, like effnetb0 + +# def forward(self, x): +# x = self.pool(nn.functional.relu(self.conv1(x))) +# x = self.pool(nn.functional.relu(self.conv2(x))) +# x = x.view(-1, 16 * 5 * 5) +# x = nn.functional.relu(self.fc1(x)) +# return x diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 660dc939..8c6a973e 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -11,7 +11,7 @@ from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule from zoobot.pytorch.estimators import define_model -from zoobot.pytorch.training import webdatamodule +from zoobot.pytorch.datasets import webdatamodule def train_default_zoobot_from_scratch( From d67fe54fc0c725e8d97adc6700d26cf5d0bffb79 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 13:20:13 -0500 Subject: [PATCH 103/307] mock wds training works locally, try on beluga --- only_for_me/narval/gpu_split.py | 117 +++- zoobot/pytorch/datasets/__init__.py | 0 zoobot/pytorch/datasets/webdatamodule.py | 20 +- zoobot/pytorch/datasets/webdataset_utils.py | 43 +- zoobot/pytorch/training/temp.ipynb | 569 ++++++++++++++++++++ 5 files changed, 719 insertions(+), 30 deletions(-) create mode 100644 
zoobot/pytorch/datasets/__init__.py create mode 100644 zoobot/pytorch/training/temp.ipynb diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py index e42a743a..6a93ad2b 100644 --- a/only_for_me/narval/gpu_split.py +++ b/only_for_me/narval/gpu_split.py @@ -1,22 +1,121 @@ # import datetime +import logging +import os +import time import torch -from torch import nn import torch.nn.functional as F import pytorch_lightning as pl -# import torchvision -import torchvision.transforms as transforms -from torchvision.datasets import CIFAR10 -from torch.utils.data import DataLoader - import argparse +from zoobot.pytorch.datasets import webdataset_utils +from zoobot.shared import schemas +from zoobot.pytorch import datasets + +from torch import nn + +class ToyLightningModule(pl.LightningModule): + + def __init__(self): + super(ToyLightningModule, self).__init__() + + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) + # pool again + # shape (B, F, H, W) + + def forward(self, x): + x = self.pool(nn.functional.relu(self.conv1(x))) + x = self.pool(nn.functional.relu(self.conv2(x))) + time.sleep(1) + return torch.mean(x, dim=(1, 2, 3)) # shape (B) + + + def training_step(self, batch, batch_idx): + images, labels = batch + y_hat = self(images) # mean after some convs + y = labels[:, 0].float() / 20. # first random number, divided by a big number to now be below 0 + loss = F.cross_entropy(y_hat, y) + return loss # meaningless but mathematically works + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=1e-3) + + def main(): - - shards = [f'shard_{x}' for x in range(4)] - + + logging.basicConfig(level=logging.INFO) + + parser = argparse.ArgumentParser() + parser.add_argument('--save-dir', dest='save_dir', type=str) + parser.add_argument('--batch-size', dest='batch_size', default=16, type=int) + parser.add_argument('--gpus', dest='gpus', default=1, type=int) + parser.add_argument('--nodes', dest='nodes', default=1, type=int) + parser.add_argument('--mixed-precision', dest='mixed_precision', + default=False, action='store_true') + parser.add_argument('--debug', dest='debug', + default=False, action='store_true') + parser.add_argument('--wandb', dest='wandb', + default=False, action='store_true') + parser.add_argument('--seed', dest='random_state', default=1, type=int) + args = parser.parse_args() + + if os.path.isdir('/home/walml/repos/zoobot'): + save_dir = '/home/walml/repos/temp' + + else: + save_dir = '/project/def-bovy/walml/temp' + + schema = schemas.decals_all_campaigns_ortho_schema + + shards = webdataset_utils.make_mock_wds(save_dir, schema.label_cols, n_shards=10, shard_size=32) + # exit() + # webdataset_utils.load_wds_directly(shards[0], max_to_load=None) + # webdataset_utils.load_wds_with_webdatamodule(shards, label_cols=schema.label_cols, max_to_load=None) + # exit() + + train_shards = shards[:8] + val_shards = shards[8:] # not actually used + + datamodule = datasets.webdatamodule.WebDataModule( + train_urls=train_shards, + val_urls=val_shards, + batch_size=args.batch_size, + num_workers=1, + label_cols=schema.label_cols, + cache_dir=None + # TODO pass through the rest + ) + use_distributed_sampler=False + + trainer = pl.Trainer( + # log_every_n_steps=16, # at batch 512 (A100 MP max), DR5 has ~161 train steps + accelerator='gpu', + devices=args.gpus, # per node + num_nodes=args.nodes, + # strategy='auto', + precision='16-mixed', + logger=False, + # callbacks=callbacks, + max_epochs=1, + 
default_root_dir=save_dir, + # plugins=plugins, + use_distributed_sampler=use_distributed_sampler + ) + + # logging.info((trainer.strategy, trainer.world_size, + # trainer.local_rank, trainer.global_rank, trainer.node_rank)) + + lightning_model = ToyLightningModule() + + trainer.fit(lightning_model, datamodule) # uses batch size of datamodule + + # batch size 16 + # shard size 16, 10 shards with 8 being assigned as training shards so 8*32 train images, 8*2 train batches + if __name__=='__main__': main() \ No newline at end of file diff --git a/zoobot/pytorch/datasets/__init__.py b/zoobot/pytorch/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 40cab0cd..27e0c408 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -42,11 +42,11 @@ def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_c logging.info(f'Creating webdatamodule with WORLD_SIZE: {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}') - print("train_urls = ", self.train_urls) - print("val_urls = ", self.val_urls) - print("train_size = ", self.train_size) - print("val_size = ", self.val_size) - print("batch_size", self.batch_size, "num_workers", self.num_workers) + logging.info(f"train_urls (before hardware splits) = {len(self.train_urls)} e.g. {self.train_urls[0]}", ) + logging.info(f"val_urls (before hardware splits) = {len(self.val_urls)} e.g. {self.val_urls[0]}", ) + # logging.info("train_size (before hardware splits) = ", self.train_size) + # logging.info("val_size (before hardware splits) = ", self.val_size) + logging.info(f"batch_size: {self.batch_size}, num_workers: {self.num_workers}") def make_image_transform(self, mode="train"): # if mode == "train": @@ -69,7 +69,7 @@ def label_transform(label_dict): def make_loader(self, urls, mode="train"): if mode == "train": dataset_size = self.train_size - shuffle = 5000 + shuffle = min(self.train_size, 5000) elif mode == "val": dataset_size = self.val_size shuffle = 0 @@ -81,7 +81,9 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0 + # , nodesplitter=nodesplitter_func + ) .shuffle(shuffle) .decode("rgb") .to_tuple('image.jpg', 'labels.json') @@ -89,7 +91,7 @@ def make_loader(self, urls, mode="train"): # torch collate stacks dicts nicely while webdataset only lists them # so use the torch collate instead .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - .repeat(5) + # .repeat(5) ) # from itertools import islice @@ -119,7 +121,7 @@ def make_loader(self, urls, mode="train"): loader.length = dataset_size // self.batch_size # temp hack instead - assert dataset_size % self.batch_size == 0 + assert dataset_size % self.batch_size == 0, (dataset_size, self.batch_size, dataset_size % self.batch_size) # if mode == "train": # ensure same number of batches in all clients # loader = loader.ddp_equalize(dataset_size // self.batch_size) diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index cc299b30..a4a22cc8 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ 
b/zoobot/pytorch/datasets/webdataset_utils.py @@ -1,4 +1,5 @@ import logging +from typing import List import os import cv2 import json @@ -10,18 +11,17 @@ import pandas as pd from PIL import Image # necessary to avoid PIL.Image error assumption in web_datasets -from galaxy_datasets.shared import label_metadata from galaxy_datasets import gz2 from galaxy_datasets.transforms import default_transforms -from galaxy_datasets.pytorch import galaxy_dataset import webdataset as wds import zoobot.pytorch.datasets.webdatamodule as webdatamodule -def make_mock_wds(save_dir, label_cols, n_shards, shard_size): - shards = [os.path.join(save_dir, f'mock_shard_{shard_n}') for shard_n in range(n_shards)] +def make_mock_wds(save_dir: str, label_cols: List, n_shards: int, shard_size: int): + counter = 0 + shards = [os.path.join(save_dir, f'mock_shard_{shard_n}_{shard_size}.tar') for shard_n in range(n_shards)] for shard in shards: sink = wds.TarWriter(shard) for galaxy_n in range(shard_size): @@ -31,6 +31,9 @@ def make_mock_wds(save_dir, label_cols, n_shards, shard_size): "labels.json": json.dumps(dict(zip(label_cols, [np.random.randint(low=0, high=10) for _ in range(len(label_cols))]))) } sink.write(data) + counter += 1 + print(counter) + return shards @@ -63,12 +66,16 @@ def galaxy_to_wds(galaxy: pd.Series, label_cols): } # just for debugging -def load_wds_directly(wds_loc): +def load_wds_directly(wds_loc, max_to_load=3): dataset = wds.WebDataset(wds_loc) \ .decode("rgb") - for sample in islice(dataset, 0, 3): + if max_to_load is not None: + sample_iterator = islice(dataset, 0, max_to_load) + else: + sample_iterator = dataset + for sample in sample_iterator: logging.info(sample['__key__']) logging.info(sample['image.jpg'].shape) # .decode(jpg) converts to decoded to 0-1 RGB float, was 0-255 logging.info(type(sample['labels.json'])) # automatically decoded @@ -91,25 +98,37 @@ def do_transform(img): logging.info(sample[1]) # just for debugging -def load_wds_with_webdatamodule(save_loc, label_cols): +def load_wds_with_webdatamodule(save_loc, label_cols, batch_size=16, max_to_load=3): wdm = webdatamodule.WebDataModule( - train_urls=glob.glob(save_loc.replace('.tar', '_*.tar')), - val_urls=[], + train_urls=save_loc, + val_urls=save_loc, # not used # train_size=len(train_catalog), # val_size=0, label_cols=label_cols, - num_workers=1 + num_workers=1, + batch_size=batch_size ) wdm.setup('fit') - for sample in islice(wdm.train_dataloader(), 0, 3): + if max_to_load is not None: + sample_iterator =islice(wdm.train_dataloader(), 0, max_to_load) + else: + sample_iterator = wdm.train_dataloader() + for sample in sample_iterator: images, labels = sample logging.info(images.shape) # logging.info(len(labels)) # list of dicts - logging.info(labels) + logging.info(labels.shape) def identity(x): # no lambda to be pickleable return x +if __name__ == '__main__': + + save_dir = '/home/walml/repos/temp' + from galaxy_datasets.shared import label_metadata + label_cols = label_metadata.decals_all_campaigns_ortho_label_cols + + make_mock_wds(save_dir, label_cols, n_shards=4, shard_size=512) diff --git a/zoobot/pytorch/training/temp.ipynb b/zoobot/pytorch/training/temp.ipynb new file mode 100644 index 00000000..e126a866 --- /dev/null +++ b/zoobot/pytorch/training/temp.ipynb @@ -0,0 +1,569 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": 
{ + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
" + ], + "text/plain": [ + " Relative Time (Wall) \\\n", + "0 NaN \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp \\\n", + "0 1.699053e+09 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp__MIN \\\n", + "0 1.699053e+09 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp__MAX \\\n", + "0 1.699054e+09 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime \\\n", + "0 630.155532 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime__MIN \\\n", + "0 60.104359 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime__MAX \\\n", + "0 1200.187255 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu \\\n", + "0 61.966053 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu__MIN \\\n", + "0 10.53 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu__MAX \\\n", + "0 90.93 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu \\\n", + "0 0 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu__MIN \\\n", + "0 0 \n", + "\n", + " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu__MAX \n", + "0 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('/home/walml/Downloads/wandb_export_2023-11-07T13 52 16.196-05 00.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# how annoying, it doesn't export the values themselves\n", + "\n", + "# transcribe manually\n", + "# https://wandb.ai/jbca-ice/narval?workspace=user-walmsley" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "data = [\n", + " (0., 0.), # by definition\n", + " (1.002, 32.4),\n", + " (1.502, 65.93),\n", + " (2.002, 75.53),\n", + " (2.502, 69.6),\n", + " (3.002, 82.67),\n", + " (3.502, 90.93),\n", + " (4.002, 82.93),\n", + " (4.502, 90.97),\n", + " (5.002, 70.),\n", + " (5.502, 41.53),\n", + " (6.002, 16.93),\n", + " (6.502, 34.33),\n", + " (7.002, 55.13),\n", + " (7.502, 58.16),\n", + " (8.002, 87.53),\n", + " (8.502, 81.13),\n", + " (9.002, 80.73),\n", + " (9.503, 84.67),\n", + " (10.03, 87.27),\n", + " (11.003, 83.4),\n", + " (11.503, 35.6),\n", + " (12.003, 11.93),\n", + " (12.503, 20.67),\n", + " (13.003, 41.2),\n", + " (13.503, 53.2),\n", + " (14.003, 69.6),\n", + " (14.503, 88.67),\n", + " (15.003, 75.73),\n", + " (15.503, 83),\n", + " (16.003, 80.33),\n", + " (16.503, 89.2),\n", + " (17.003, 65.67),\n", + " (17.503, 11.4),\n", + " (18.003, 10.53),\n", + " (18.503, 40.87),\n", + " (19.003, 52.67),\n", + " (19.503, 75.8),\n", + " (20.003, 84.13)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(data=data, columns=['time', 'utilisation'])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + " time utilisation\n", + "0 0.000 0.00\n", + "1 1.002 32.40\n", + "2 1.502 65.93\n", + "3 2.002 75.53\n", + "4 2.502 69.60\n", + "5 3.002 82.67\n", + "6 3.502 90.93\n", + "7 4.002 82.93\n", + "8 4.502 90.97\n", + "9 5.002 70.00\n", + "10 5.502 41.53\n", + "11 6.002 16.93\n", + "12 6.502 34.33\n", + "13 7.002 55.13\n", + "14 7.502 58.16\n", + "15 8.002 87.53\n", + "16 8.502 81.13\n", + "17 9.002 80.73\n", + "18 9.503 84.67\n", + "19 10.030 87.27\n", + "20 11.003 83.40\n", + "21 11.503 35.60\n", + "22 12.003 11.93\n", + "23 12.503 20.67\n", + "24 13.003 41.20\n", + "25 13.503 53.20\n", + "26 14.003 69.60\n", + "27 14.503 88.67\n", + "28 15.003 75.73\n", + "29 15.503 83.00\n", + "30 16.003 80.33\n", + "31 16.503 89.20\n", + "32 17.003 65.67\n", + "33 17.503 11.40\n", + "34 18.003 10.53\n", + "35 18.503 40.87\n", + "36 19.003 52.67\n", + "37 19.503 75.80\n", + "38 20.003 84.13" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.ticker import PercentFormatter" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAC5CAYAAADOB4NQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAABLiklEQVR4nO2dd1hTZ/vHvwkQIGHvGZYgshRlKlLFVUer4nrdWFe1Vm193VscrX21fdXW1trWgf21tsXVqnW+daIiQ0EQZe+9IYQk5/cHJZISIIGQAc/nurguPfM+T84593nuSaMoigKBQCAQeiV0RQtAIBAIBMVBlACBQCD0YogSIBAIhF4MUQIEAoHQiyFKgEAgEHoxRAkQCARCL4YoAQKBQOjFECVAIBAIvRiiBAgEAqEXQ5RAL+Xw4cOYOXOmosXodZBx7z7I2HYOdUUL0FuZO3cuHj9+3Gr5wYMHMX78eAVI1DHXrl3DmTNnkJCQgJqaGiQmJkJdXbVuIVUc96+//hp//vknMjIywGKxMHToUKxduxZGRkaKFk0EVRzbY8eOITIyEvn5+dDS0oK3tzfWr18PBwcHRYsmN1TrCe5hzJ8/H4sXLxZZpqenpyBpOqa+vh4BAQEYPHgwDh48qGhxOo2qjfvTp08RFhYGDw8P1NbWIjw8HKtXr8apU6cULVorVG1sbW1tsW3bNtja2qK2thaHDx/G0qVLce3aNUWLJjeIOUiBaGtrw9TUVORPU1MTwJup7fHjxxEYGAhfX18cPHgQLev9ZWRk4L333oOXlxcCAwPx6aefgsfjCdfX1dVh165dGDJkCLy8vDB58mTEx8eLyBAREYGgoCD4+/tj//79aK+e4MSJE7Fs2TIMGDBAtgMhZ1Rt3L/99ltMnDgRTk5O8PLywqZNm/Do0SNUV1fLeGS6jqqN7dixYzF48GDY2trC1dUVK1euRGZmJkpKSmQ8MsoLmQkoMcnJyTA2NsapU6eQmpqKTZs2wd7eHqGhoeDz+Vi+fDnYbDZ++eUXFBQUYOPGjdDX18f7778PANi6dSsSExPx6aefgs1mIykpCQKBQHj8ly9fgs1m4+TJk0hPT8fq1avh4+ODkJAQRV2yUqDs415eXg5NTU1oa2t3y/V3J8o8thwOB+fOnYODg4PSmdq6FYqgEObMmUO5u7tTAwYMEPnLysqiKIqiDh06RHl5eVEVFRXCfQ4ePEhNnjyZoiiK+uuvvyhPT0+qvLxcuP7HH3+k/P39KYqiqKysLMrFxYV69uyZ2PMfOnSI8vX1pTgcjnDZe++9R33yyScdyh4VFUW5uLhQjY2NUl+3olHlcacoimpoaKBCQ0OprVu3SnXd8kBVx/bWrVvUgAEDqL59+1JjxowRyttbIDMBBTJt2jSEhYWJLLOwsBD+m81mQ19fX/h/Ly8voR04PT0ddnZ2MDAwEK739vZGeXk5Kioq8OrVKzCZTHh6erZ5fjs7O+FUHQBMTExQWlraxatSflR13Pl8Pv79738DANavX9/h9opAFcfW398f58+fR3FxMX744QesWbMGZ86cgYaGhiSXrPJIrQTi4uIQExODvLw8cDgcGBoawtXVFf7+/jAxMekOGXssenp6sLOza3M9jUZrcx3VQS+gjtYDaBXZQ6PRwOfzO9xP1VHFcRcIBNiwYQPS09Nx+vRpsFisDs+jCFRxbJlMJuzs7GBnZwcvLy/4+fnhzp07GDFiRIfn6wlI5Biur6/H0aNHMXz4cISFheHatWsoLCwEh8NBcnIyDhw4gGHDhuH9999HbGxsd8vca8jMzERVVZXw/8+fPxeGrjk6OiIzMxMVFRXC9bGxsTAyMoKBgQFcXFxQV1eH58+fy1tslUfZxp2iKGzevBnx8fH4/vvvRb6UVQ1lG1txUBSlcqHPXUGiKx0zZgwGDx6M3bt3IyAgAGpqaq22KSwsxJUrV7Bx40bMmzcPs2bNkrmwPY36+noUFxeLLNPR0RE6/Oh0OrZs2YKVK1ciLS0Np0+fxqZNmwAAQUFBsLGxwYYNG/Dxxx8jPz8fhw8fxvz58wE0hb5NmDABa9euxdatW8Fms/Hy5UuYmJh0OrqnoqIC+fn5yMrKAtDk5FNTUwObzVbaL1NxqNq4b9++Hbdv38axY8cAQCi7kZGR2GdRkaja2H722WcYOX
IkzMzMUFpaimPHjsHQ0BADBw7s/CCoGBIpgZ9++glWVlbtbmNubo6wsDDMnz8fRUVFMhGup3Py5EmcPHlSZNm6deuwcOFCAICrqys8PDwwe/Zs8Pl8zJw5E6GhoQCaHqavvvoKO3fuxNSpU8FisTBp0iQsWrRIeKzw8HB8+umn+Pjjj8HhcODk5IQdO3Z0Wt5bt25h48aNwv9PmTIFAHDq1Cn4+/t3+rjyRtXG/eeffwbQZG9vyc2bN2FjY9Pp43YHqja2+fn5WLVqFcrKymBkZAQfHx+cOHECurq6nT6mqkGjJDG0EeTO4cOH8eDBA/zf//2fokXpVZBx7z7I2ConnTZ8VVVV4dChQ4iJiYFAIED//v3x4YcfEucwgUAgqBCdzhhev3499PX1sX//fuzbtw98Ph+rV6+WoWgEAoFA6G4kNgft2rULH3/8MXR0dAA0OYsvXLgALS0tAE2Zem0VkCIQCASCciKxOcjW1hahoaFYvnw5Jk2ahKlTp2L69OkIDg4Gj8fDtWvX8K9//as7ZSUQCASCjJHKMVxYWIhPPvkEhYWF2LZtGwQCAaKjo4U+AW9v7+6UlUAgEAgyRiqfgLm5OT7//HOsWLECa9aswblz5xAaGoqwsLBOKYCIiAiEhobCw8MDH330kci6lJQUTJ8+Hf3798eECRMQHR0tsv7q1asYMWIEBgwYgPfeew+FhYXCdZcuXcKQIUMQEhKCR48eCZdXVlYiNDQUtbW1UstKIBAIPRGplEB5eTmeP38ODw8PnD9/HoaGhpg0aRIuXbrUqZObmZlh+fLlmD59usjyxsZGLFu2DCNHjsSTJ0+wePFiLF++HJWVlQCA1NRUbNy4EeHh4YiKioKdnR3WrFkDAODxeAgPD8epU6ewfft2hIeHC4/7n//8B8uWLVOpxCYCgUDoTiT2Cfz222/Yt28fHB0dkZeXh82bN2P58uV45513sGfPHpw9exbbt29Hnz59JD756NGjAQBJSUkoLy8XLn/8+DE4HA4WLVoEOp2OiRMn4uTJk7h27RqmTZuGixcvIjg4GIMHDwYArFq1CkOGDEFWVhaYTCYYDAacnJxga2uL7OxsAE3p5yUlJRg1alSHchUVFbXKegSawmJTU1Ph5uYmUqSKQCAQ5E1DQwNycnIQFBTUpdLXEiuBzz//HF9++SX8/f2Rm5uLJUuWYOzYsbC1tcXXX3+NGzdu4IMPPsCff/7ZaWGaefXqFVxcXECnv5mouLq64tWrVwCaTEVeXl7CdQYGBrC0tERKSoqwbnhKSgry8/PRp08f8Hg8fPrppzhw4IBE5//5559x5MiRLl8HgUAgdDefffYZ3n333U7vL1WyWPNLWU1NTaSRAwCMHDkSQUFBnRakJbW1ta3StvX09ISdlOrq6sSur62tBZ1Ox2effYYtW7aAwWBg9+7dOHnyJEaOHInq6mosXLgQXC4XH374Ifz8/MSef8aMGWKbUCQlJWHz5s347LPP4OTkJJNrJRAIhM6QmpqKtWvXdrl0iMRKYPXq1Vi+fDmcnJyQk5ODDRs2tNqmOWegq7BYLNTU1Igsq66uFtrymUxmu+sDAwMRGBgIAMjLy8P169cRERGBf/3rX9i4cSPMzc0xZ84c3L59W2xpWzMzM5iZmbUpn5OTE9zd3bt0jQQCgSALumqallgJTJ06FSEhIcjJyQGbze7WcrbOzs44fvw4BAKBcPaRlJSEmTNnAgBcXFyQnJws3L6yshL5+flwcXFpdaw9e/Zg3bp1UFdXR0pKCjw9PcFgMMDj8VBWVgZjY+Nuuw4CgUBQdqSKDjIyMoKXl5fMFACPx0NDQwN4PB4EAgEaGhrQ2NgIPz8/MBgMfP/99+Byubh06RJycnKETt13330Xd+7cwcOHD8HhcHDo0CEMGDAAbDZb5Pg3b96EsbGxsCysjY0NoqKi8Pr1a3C5XJWuy04gEAgyQZIelAsWLKCioqI63K6srIz68ssvqdOnT0vU2/LQoUOUi4uLyN/69espiqKo5ORkaurUqZSnpyc1btw46vHjxyL7Xr58mQoJCaG8vLyoBQsWUAUFBSLra2trqdDQUJF+pg8ePKCGDx9ODRkyhPr9998lkrElCQkJlIuLC5WQkCD1vgQCgSBLZPU+kihj+MqVK/jvf/+LhoYGDB06FG5ubjA1NQWDwUB1dTXS0tIQExOD2NhYvPvuu1ixYgVMTU3locPkSmJiIkJDQxEZGanSPgFOIx9pxTWIziyHr70RHExY0NKQrDlJV/YlEAiyQ1bvI4l8AmPHjsXYsWNx//59/Pnnnzhz5gzy8vKEJhVXV1cEBwfjs88+IzZ2JYfTyMfmc8/xW0yucNmUQdbYM8mzw5d5V/YlEAjKiVQhokOGDMGQIUO6SxaCHEgvqRV5iQPAb09zsSjIEf0s9bptXwJB0ZBZrHh6TzdlGSIQCMAX8BUthtTQQMOTjDKx66IzytDXXAcUxFsHu7KvMkPDmxBhecmviHMqE418tHoZd/e7uJGPNmexqqoH/pmr1VmIEugEUblRyNXO7XhDJcNCxwIDbNli1w1g6+FZ0TMU1BSIXW/OsoCHdef2VUb0NPXgaTYQ6SV1iM2sxEA7fdibMPG8KAZVDVU95pzKBJ1GR6BNMHZcSBZ9GQ+0xo6JrniYcwcCSjYvtpZY6FhAjc8WO4sNG8wGXy1Lpe7dZrJzs2VyHKIEOoEuQxdGWp2v1aEoMkvqEchmYspA61YPob0xC8kljWKvi6IofHOrHDvfcRW7r6MJCy+Kxe+rjNBAg5upF7ZdSGp1LbsmDsKL4mcy/0JXxDmVDSNtI2SW1Ld+GcfkYkGQHRwNHFFWL3622RWMtY1xO7FS7LrYrEoMdzMGl8eV+Xm7mwpGhUyOQ5RAJ6CDDjW6as0hK+r42HOxEha6T3B6kR/eC7LD08wKDLA1gLmeNlb+9BTzhmpAT7v1dZ19VIOrz+ugrRGH8InemD+4aV9vtgGczVjIqkoVqfOk7BhqGSK9tE7sy+i9IDsYM41RzilvY+/2oYEGAy0DaKprooHXgApOBShQTecsEX/OBV08p6qgraGNBy8rxK6LyazACHdtqDXI/rni8rnwtDEUu26QnQEa+eUq9zw332eyQHWeXEKn4TQKsPdiOYqq+BjiCqSWJ0Jbqxwj3BnQYZbju4dPcDu5HHsvlqOhsemFZaFjAUMtQ9x8UY+fH9XA0VQdU/zUkFKaCKZ2GWoauPgtJhPJJc9R11in6EuUCk11TcRkVIhd9zSzAprqnUvDZ2ow0dfYA3X1hriZwEUdxxB9jd2RXaKOuEwKT9o455P0CjxJE+BmYh1Kq8X7mmigifwuLf0KqkIDrwED7Q3ErhtkZ4AGXkOHx+jMOPzfozxYGTTNgFvSPAOu4FRIIr7S0HyfNTToy+R4nZoJpKSk4MmTJygrK2vlnFi1apVMBCN0jeYvBQ01TfzyuAw5ZQJM9WNhpDsTFCiRr84xnkyUVPORkq8GF2MP5FfW415mBQawDTHfzwqJWTGYF6wObQYdFChUNlQgqbAKl+Pr4GVnDEczD
QVeqfQ0vYzEfxkOZBuggSf9FzkNNLD1nLDtwotW5p51b7vikyvJmBMg3qfS30YfP9zPwKVnTX4BGyN1DGAzMICtCTdrBgyZLLD1nJBeUot7mRUYaG+IvsY2yKpKVSkFXMGpQF9jG7EmRTtjJlJK09vdn6nBlHoc4jIb8P2dKqQVxuDQzEEIG2yH6H/MYlXJDNfyPou89QwMGRxTaiVw4sQJfPLJJ7C3t2+VECauGBtB/rR8WKIzKhDsbI35gc4orEtDPa9e7D6zA3Vhb+CBvX+0tlkfmTUIr8oSRR6WIBctXI6vw92UepVTAk0vI2uxLyNzPW3E5r+GiY6aWLNOWxhoGSC9VEwIbUwu5gbaYc5gHbia6Yg9Z18LHUzxF8DJUh/xWVzEZzXg97g6/B5XByaDjjvrPMQql10T3fCyNEFlXmIUKKSVv8b6t10xy5+NxLxKoTny8O1nGO3VtmGiPSXb1jjkVfBw8GoFtBk0TP57FqvP1EdtAw+RMZmY5CtQmbFrpuV9Jqu3rdRK4LvvvsOuXbtadQMjKAedeVgAwFDbEPkVbdvJDbQMRGYPLhYaMNWl40EKB3OH6IKuQh8AFCjcT0/BurddMTfQDs9yKjDQzgDWBkzM/e4x9LXo+HquDzJL68R+cYqz+3O4GniSXiH2fM9ymuzdWVWp2DXRTeiPGWRnAHvjpq9RXW0ahvbVxtC+2qAoCtllPMRlcqHLMEBGG74Ecb+LsnP3VTn+++dt7JxkhxHu+uDwyrDgRDpeFjTAw9YEVobiX0ntKdnmcajgVAh/lyoOBwcvp6KeS2Hzu4awMlAHBQoVnAo8y6/Etef1GORkAhsj1XKLaqpr4l4bZsXOIrVPgMvlwt/fX6ZCEGRHew9LRmltm84kTXVNxGRWiF0nzk5Oo9EwxEUbJTUCvMxvlIXocuX+63IE77+N8voijHBngKlVjvSKRLzdn46DMwZhx8UkTDj8ANsuvsCEQw+w7cILsPWcWtn9a+sNYafvjj/iauFpLd5G22zvrmusw8vSBKE/RlurHC9LE1qZMmg0GtjGGnh3IAszA4zxIk98ZEtX/BeK4taLejTyBTDSq0FBTQEqOBWYPYQFvgA4db+6zf3a8+NEZ1SAqcES+V3qOYb4aUkw1o2zxAA70THyc2gqeR+dzpHZdckLDo8DTxsDmR5TaiUwc+ZM/PrrrzIVgiA7Ouv07IzTLsil6WG6l6J6D1NiLheNfAE0GFUoqClAOaccFCiM87BEYZX4MMbUolrY6DbNspoVxDuHH2DvH0lYEGQPFwtWh87HZn9My3O2hyycqcpCeS0fMRkN6M9mwFjnTTSOqyUDgX008SStAYk54kM1G3gN6M82ELvOw1ofmnRDkd8l9KuH2H81GXP93Fs5jz1sGNDSoOFxmuqMXTNNTm7tVvdZV5B6LpSbm4vbt2/jf//7H5ydnaGuLnqI/fv3y0w4ZUUA5c0Yrm+sR39b8fWbBtoZoL6xVKzspXWlcDMVbye3N2biRfHrVi8sWyMarA3V8PBVPeYHMaFGVw2TkEBAISmPCyczdTDUKZHx0FDTQFxWhdj9yuoaQRW3EeY5xA76rBrsmtgPC4LsEJPZZGJyMGYivSIFPAGvU7J25ndRVv6XVAcBBQzrp9XqHpwZyMKTtAb8cLcK+6YbtDIv/vE8F9O9rcT7VMx18bKwus3wWz1NPZH8AzodGMDWwKNULkprGmHAVI0gyUepDfjurzdO7rdM67H2dtePK7USUFdXl6hZe0+mmluNMo7sk1pkQXFtOXyt2orA0MbDnLQ2szLji55ix8SBCBvCRmxmJbz/zmiNL3raZkart4MAv8dQeJhWBjcb1VAC2aUUahsoOJrzW/2ODHUGvO3ER/G4Wujgz8RCsetiMisQ0LcR93P+ghnLDAEuTNQ1ZuJ+TlGXs2Bb/i7RGZXob6MPZ4v2fxdlg6Io3HghAEsTcLSsQRmnVmQ9QxMY5k7Djec8XE0oRYDzmxdzdKoA3/2Pwv2UJzg217fV/Vlan4eEXPFjHJNRgYC+gla/cz8bAaJSgbuvyjCkr/IrgbQiCoeuCaCrDYT0r8SDnL/AU6vpeEcJkFoJ7Nu3TyYnVmUCrAPg5ugml3NJW2fm27vpOHrtCc4sCsCioY6IziiDT4v6LCMcRnR4DHdLA7hbGgjP6W/dtg/ISbcGv8fcQ36xJVYFe3Z8QUrAD/kZAJIx3Xsghjm2LnneyG+qK/PbU9E6M0YsLfg5iJ9l+ToYo4+hDpwMu6/3tLulAeKyqvDr0xyET3Jv93dRNuKyK1BQEYV5gWyMcxb/7PhZNiLk9R3ceqaODSOCwVCnIyarHGfuP4GZrjp2T+wPFkOj1f3J0rCFn4P4F2Jbv4uvBRen7t5GXrEJRo0dJNNrlRXNz35xTQN2nH0AdXojTi7wg5dNk+/pRd0LmZyn067x1NRUpKc3xfU6OjrC0dFRJgI14+3tLfL/hoYGBAcH4+uvvwYAhISEoKSkBGpqTbZFKysr/PHHHwCA5ORkfPzxxygrK8P777+PsLAw4XEWLlyIjz76CB4eHp2WjU6XT8ZwU9XDaomrHpbXcvHl7VToa2tAU52OfpZ63V7d09lcH+5WeriaWIjdkz2hqa78mZdPMspBpwF+DsZif0c1OrBnkicWBYkqUYY6HQ4mLLEKwsGEJZes6disCkTG5mLDuH7Q01Kd0Nxfn+YBAKb7sNt8doxYagif6A4/R2O8LKxBXHYF3K308NfaYeDyBLAxZLZ5fGl/F2MdbfjZG+He61I08CgwGcoVJdTy2few0seFFUOQXlILb/ab0iyyut+kvvLS0lKsW7cO9+/fh55e0wumuroaQ4YMwf79+2FkJJv6MbGxscJ/8/l8DBs2DGPHjhXZ5siRIwgODm6174EDBxAWFoZhw4bhnXfewfjx42FqaopLly7BwcGhSwpAXnSmdv+hW69QzeFhz2T51vd/t78V9l1Jxp2UEoxyM5fbeTuDQEDhcXoZPKz1odvOS1RLQ02sEtXSUBOrIOQ13o6mLABAenEt+tsayOWcXaWey8fv8Xlws9SDRxsRVM2Mcrdofd8PtMaeye3PMjvzu4xyM8fDtFLce1WC0e4W0l1UNyL22ZdgDDqL1Kpk586dqKmpweXLl/H48WM8fvwYv//+O6qrq7Fz587ukBF3795FXV0dxowZI9H2OTk58PX1hZmZGezt7ZGfn4/q6mr88MMPWL16dYf7FxUVITExsdVfampqF69Ectqq3Z9eUtvm9qcfZqK/jT7e8bKUh4hCJvS3AgBcjM+T63k7w6uiGpTXNcLfofMfK80KYm6gPfpZ6slV4Tqa6gAA0kpkYw+WB38mFqC6gYdpPjYdbiv2vo9p+75vibS/S/MHy/UX4v08iqIrY9AZpJ4J3L17Fz/++KOI+cfJyQnbtm3D3LlzZSpcM5GRkRg/fjy0tLRElm/YsAECgQDOzs5YvXo1Bg1qsu05OzsjKioKLBYLubm5sLW1xYEDB7BkyRLo6Oh0eL6ff/4ZR44c6ZZrkZT2aveLM/Hsv5oMnoDC
pnH95J65bW2gDR87Q9x4UYg6Lk/pptYtiUorBQD4t2HbV3YcTJpmAmnF3fNC6A7ORmdDQ42GiQM6DmuU9r7vCrZGTLha6OJWchH4AkppotvkOQZAJ5SAhoYG6upa1+mor69vFS4qC8rKynDr1i2cOXNGZPn+/fuFZp3IyEgsXrwYly5dgrW1NdavX48dO3bg7Nmz2LBhA7Kzs1FQUIDAwECsWbMGBQUFGDt2LObMmSP2nDNmzEBISEir5ampqVi7dq3Mr1EcA9nia9uIMwFEZ5ThSkIBRruZw99RMS+3d/pb4XluJV7kV8HHTnlLSj9KLwWNBvh2YSagSBxMWKDRVEcJZJfV4UFqKcZ6WMCI1XGlG1978b+LTxvLu8ooN3McvvUaMVnlbZ5b3sh7DKQ2B40aNQqbNm3C3bt3UVVVhaqqKty5cwdbtmzB6NGjZS7gpUuXYGdnh/79+4ss9/HxgZaWFrS0tDBr1iy4ubnhzp07AABra2t8++23OHfuHMaOHYt9+/Zh8+bN+Oabb9CnTx/88MMP+Omnn/D69Wux5zQzM4O7u3urPyen7ov8aElRFQdmeppiE4/M9bSw42ICGvlNIXEUReGvlGIwGWrYMNZVLvKJ493+VrizbjjoNBpOPcxAUn4VOI3KlUtBUU3+ADdLPehrq45TtSVaGmqw0tdGWjeZBmRNVlkd3vGyxL98bSXavtnB25JmB293oIwmIQcTMUmH3TgGUn+6b9myBXv37sWyZcvA5zc95GpqaggNDcWGDRtkLmBkZCRCQ0M73I5Go4GiWodQnj59GsOHD4etrS1SUlIQFhYGBoMBFxcXpKSkoE+fPjKXuSvw+AJ8+H+xqOXy8H+LRcM87Y1Z2P3HC8TnVKCqvhEFVRxEZ5TjLRdTvBfkAEOmLGoKdg5thhp2n3shlSNb3qQW16Ckhot3+8su21IROJqy8CSjDAIBBbqSmDD+SXM/35TCaoQNsYe7lWRlj+XtePe01oe5niauvyjExrGuSlEEU0tDDZvG9cNMPzZe5Fd1ez9kqZWAtrY2wsPDhWYWALC1tQWLJXstlZiYiNevX2PixIkiy/Py8pCXlwcvLy8AwPnz55GQkIA9e/aIbFdYWIgrV64gIiICAGBjY4NHjx7B29sbCQkJWLBggcxl7ipf3HiFR+llmB9oB10tDfSz1BCxA+541x01DTzs/r11gTh5RwW1RBWa0EelNdla/R2VY9rfWZxMdXD3VQnyqziwNtBWtDit6ExkW0vaiszqDmg0Gkb2M8eZR1lILa5FH7OOfYbdTRWnEcGf3cYHw/pg+fDu/0jtdKApi8WCq6srXF1du0UBAE2zgLfeegsmJiYiy+vq6hAeHg5/f38EBQXh3Llz+Prrr8Fmi2Z67tmzB2vXroWGRtPUf+nSpYiKisKwYcMQEhICT0/lSm7638siHLn9Gl42+tg0vp/YbTTU6Cio5Mg1ekAS2nNmKQuP0ptk8VMS229neeMcVs4IIWkj2xSNspmEbicXobaBD32mfEyWEs0EZs+ejaNHj0JPTw+zZs1qd8r0TwduV9i6davY5X369MGFCxc63P/QoUMi/7e0tMQvv/wiE9lkTTWnEVcTCmCsw8CXswa2m3Ql7+gBSZC3M0taKIrCo7RSuFrowlACB6Uy05wrkFZci6HOrTOeFY0y3p/tEehkDCMWA3XcztV3kjXXXhSCRgNG9ZNPzo1ESiAwMFD4NR0YGKgUdrOeQrPt9FF6GaYOssHm8f3aTWIClPOF217GpjKQUVqHouoGjPVQnqSgztKcK6CsX9bKeH+2h6a6Gm58/BZSi2tw6mFGt9vg26OBx8f/kovgbWsAMz2tjneQARIpgRUrVgj//eGHH3abML2NztpOlfGF2+zQWzDYAU8yyjDIzhAu5rpK4xR+1JwfoKAQWlliqacFLQ06UpXUHNQc3fLP+1pZPgj+CaeRjz1/KEdQw4PUUtRy+XLNYJbaJzBixAiUl7fuZFRVVYURIzouTkZ4Q2dtp80v3CurhiJ8ojuurBqqFFE4WhpqcDHXRXktF/delShcnpYI/QEqmh/QEjqdBgcTHaXNFbidXIR1b7vi4oohSnV/toUy+TCu/V2ldrQcy690qp/AP5vLA03JYkVFRTIRqrfQFdupPCMopIGhTkdUehkSciuxONgRGmqKL9NLURSi0krhbKYDEx3V6sTVFo4mLFxOyAenka9UL1eBgMJ/rr1EZX0j7q0PgZeMu2B1B8riwxAIKFx/UYg+ZjpCk588kFgJNJdRoNFo+O6778BkvqnoJxAIEB8fD2dnZ9lL2INRNduppAQ6GuNxehme5VRikJ34zGd5kl1Wj/xKDuYEiO8ToIo4mrJAUU1fscr0IfDXq2KkFtdiabCjUimn9lCW5zA2uwIlNQ2YLkGNJVkisRJ48OABgKavqujoaJESEerq6rC2tu6WZLGejKrZTiUl0MkY/735ClFppUqhBKLSVbtekDhaRggpkxL4/l461Og0zB9sr2hRJEZZfGzXEgsAQO4VTSVWAj/++CMAYOPGjdi8ebNEhdgI7aNOp2Hd266YE2CHhNxKuZcl7i682QbQVKfjYWopPpBDsktH6Giq4x0vS5VPEmuJo0lzhJDyOIdfFlTj7qsSTPCyhJUSJrG1RbOPbX6gPZ7+3bujj5mOXJ9DiqLwZ2IBLPS04NVBuW1ZI7XBdt++fUQByIiUwhoE77+N7LI6hZQl7i401dUwyM4Q0ZllaOAprn4Qp5GPF3mVKKziYMEQB5VqwtIRDqbKV030+3tNTaYWBjkoWBLp0dJQAw1AcXUDqjmNcn8OXxfVIKO0DqPczOVeCqRTZT/v3LmDP//8EwUFBWhsbBRZd+rUKZkI1huIzS5HA08AbSUuvdxZAh2N8SC1FPHZlQqJyOlq6QJlR09LAyY6mkhVklyBkpoGnIvLxUC2AbzbqICr7GhqqOGr/6WCTqMh0Mmk4x1kyLW/s5VHu8u/KZPUM4HTp09j9erVUFdXx6NHj8Bms0Gn05GYmKh0ZRiUnbisCgDAABXpECUNgU5N9vcHqSUKOb8yhf11F46mLKQV14gtnChvzkRlgcsTYGGQbNvMyhPbv9tXZpW1LpXf3VxLLICulrpC/FZSK4GIiAjs3bsXO3fuhIaGBpYsWYITJ05g4cKFqKqq6g4Zeyxx2RWwNtCGqW7PCFtsiZeNAZgMNTxMLe3W8zSbfFqWr27k8fE4Xfx5lamWUVdxMmWhmsNDSQ1XoXI08Pg4HZUJawNtjFHAl6ys0GaowVRXU+5KIL+yHvE5lQhxNQNDXf4h1VLbIQoLC4Vf/EwmE9XV1QCACRMmIDQ0FOHh4bKVsIdSxWnE6+IajPOUbytIecFQp8PH3ghRqaXdFsveVi/WbRPc2ixdrOrhty154xyuVeiHREphNQIdjeBtawh1JcgL6QpsIyYy5DxbvNFsCnJTTEkTqX8xKysrYVKYnZ0dbt++DQB4+vQpNDV73hdtd/EsuxIUBXj3QFNQM4GOxuDyBYjJbJ1hLgva6sWaVlwLV0s9uTYnUQR
vwkQVEyHUPAuLzijH/MH2mNkD8jDsjJgoreWipkF+xeQ0NdQw2dsab/VVTDFAqWcCkyZNQnx8PLy9vbFkyRJ8+OGHOHXqFGpqaiRq4k5oIi676cXozTZQrCDdSLNf4GFaKQb3kb2jra1Mz4S8SnjbGcq1OYkiEJaUVoCfo6c63m2N/vYLlNbBzap78y84jXykFtegtoGHeYF2UFdQgyCplcCSJUuE/x42bBguX76MFy9egM1mo18/8TXwCa2Jy66AOp0mccclVcTDSg86murd5hfoKNNTWUtryApbIybU6TSFzARUoYlQZ2AbvXEOd6cSUCYl2mUDnqWlJaysrGBuLluH0IYNG+Dh4QFvb2/hX15ennB9SkoKpk+fjv79+2PChAmIjo4WrktOTsa4ceMQEBCAEydOiBx34cKFSEhIkKms0kJRFGKzKuBm1TPyAtpCXY0OPwcjxGVXdEutdnn3YlU2NNToYBszFZIroApNhDoD27hJCWR3s3NYmaLXpFYC27Ztw9mzZwEAjY2NmDFjBqZNm4bhw4fj7t27MhUuLCwMsbGxwj8rKyvheZctW4aRI0fiyZMnWLx4MZYvX47KykoAwIEDBxAWFoaLFy/i6NGjKC4uBtDUtN7BwQEeHh4ylVNacsrrUVrL7ZGhof8k0NEYPAGFJxmy9wvQAKx/2xWRywarRLXK7sDRRAdZZXVo5Lcu6tid+NopR70dWdNyJtCdKJMSlVoJ3Lp1C+7u7gCAGzduoKKiAg8ePMDKlSvxxRdfyFo+sTx+/BgcDgeLFi0Cg8HAxIkTYWNjg2vXrgEAcnJy4OvrCzMzM9jb2yM/Px/V1dX44YcflMJvEZtdAaBn5gf8E6FfoBtMQreSizB0/21U1HN7VMa1NDiZssATUN3+5fpPrAy1e+QszFRHE5rq9G5XAj5KpESl9glUVVXByKhJ0Dt37mDcuHEwMjLC2LFjhZVGZcXZs2dx9uxZWFhYYN68eZg6dSoA4NWrV3BxcQGd/kaHubq64tWrVwAAZ2dnREVFgcViITc3F7a2tjhw4ACWLFkiUcmLoqIi4eyhJampqTK5rtisZqewamZWSkM/Sz3oaanjYZrslcDF+Dxw+QK4WfZcv0pHvOk3XCu38sNFVRys+DEWX84e2OR4z+w5jnc6nQa2EbPblaqVgZbSFI+UWglYW1sjPj4eBgYG+Ouvv4Rf/xUVFdDSkl07tLlz52LdunXQ19dHdHQ0Vq5cCV1dXYwZMwa1tbXQ1dUV2V5PT0+Ys7B+/Xrs2LEDZ8+exYYNG5CdnY2CggIEBgZizZo1KCgowNixYzFnzhyx5/75559lrtBaEpddAQOmBuyNmR1vrOKo0WkIcDTGzeQiVHMaO2ydKSnVnEbcTC6Cn70RLPTl04ZPGWl+8aeV1ACQT6LWnstJeJxRhmc5FRjRzxz9ujmKRt6wjZi486oYfAEFtW6I2EkrrsGas/E4Ns8Hi4YqPnpNaiXwwQcfYN26ddDU1ISLiwt8fX0BAPfv34ebm5vMBGs2OQGAv78/Zs+ejatXr2LMmDFgsVioqRGNiKiurgaL1aRFra2t8e233wIA+Hw+5syZg/379+Obb75Bnz59sG/fPoSGhiIgIAB9+rSucjljxgyEhIS0Wp6amoq1a9d26boaeHwk5lUh0NG41/RqDnQyxrUXhXiSUYYQV9m8qK4lFoLLE+DdAVYyOZ6q4ijnQnIPUktwIS4PI1zNMEJOjdDlja0RE418CgVVHFjLuBoqRVHYeiEBsdkVyCqrxSA7I4VHU0mtBCZMmAB/f38UFRXB1dVV+CLz9/cX++KUFXQ6XVgjxdnZGcePH4dAIBCahJKSkjBz5sxW+50+fRrDhw+Hra0tUlJSEBYWBgaDARcXF6SkpIhVAmZmZjAzM+uW60jKrwaXJ+jR+QH/pKVfQFZK4GJ8HtTpNIzz6JkZ15JizGJAT0tdLrkCjXwBtl1IhKY6Hdvfce94BxWF3SJXQNZK4GJ8Hu6/LsV0HxsMasMvIG86FSJqamoKd3d3qKm9mbp4eXnByclJZoJdvnwZNTU1EAgEiI6ORkREBEaNGgUA8PPzA4PBwPfffw8ul4tLly4hJydHuL6ZwsJCXLlyBQsWLAAA2NjY4NGjR6itrUVCQgJsbW1lJq+kxP3tD+gNTuFmXMx0YcRiyMwvUFrTgHuvSxDsYgpDFkMmx1RVaDQaHEzl02/4+3vpeF1Ug+XD+ghDKXsizUpA1n6BKk4jdv+RBEOmBjaMVZ6cKolmArNnz8bRo0ehp6eHWbNmtWvGOHPmjEwEO3PmDLZt2wY+nw8rKyusWrUK48ePBwBoaGjg6NGj2LJlCw4dOgRbW1t8+eWXMDAwEDnGnj17sHbtWmhoNNmhly5dipUrV+Knn37ClClTFFL1NK4XRQY1Q6fTENTHGBSAei6vy6WzLycUgC+g8G7/3m0KasbJhIX47ApUcRq7rWdCNacRKYXVcDbTwdK3VLdSqCQ0KzhZRwgd+PMliqsbsH+KF4yU6ONFoqcxMDBQ+CIdPHhwtwrUTEfKpG/fvvjll1/a3ebQoUMi/7e0tOxwn+4mLrsCjiYsGDCV5yaQB7sneeJlYTXORufAz6FrTrBLcXnQ0qBjlFvPtElLS0u/gKw/LjiNfKQV1+Bxehlm+rGx892eH4bbHSWlCyo5KK/jItDRCFMHybeHcEdIpARWrFgh9t8E6Sir5SKjtA6h3tYdb9yD4DTysfNSokxS5HMr6vE4owzjvSzB0ux5zXg6gzBCqLhGpkpAmUobyBNthhrMdDWRKQMl0KxEH6WXYV6gPdws9eTeOawjJHqKBALJsxFbxu4TRIlvNgX1IqcwINs6M7/HN5UOIaagNzTPBGRdcqCn1geSBLYRs8vOdlVRohIpATc3N4nDGZOSkrokUE+mN2UKt6S9FHlpXyYX4/Ogq6WOYQoqu6uM2BuzQKPJPkxUlr+bqsE2YiI6s7xLuS2qokQlUgKkb7BsiM0qh6Y6Ha4WynMDyIOOqn1KSmpxDRLzqjDdxwaa6srzJaVotDTUYKWvjVQZVxOV1e+mitgKI4Tq4WbVOSWgKkpUIiXg5+fX3XL0eAQCCvHZFfCw1ldICzlF4mDCwpRB1vjtaddS5C/GNZuCepdPRRL6WujAgMmAQEDJzOZsb8xSmtIG8kYWJaVVRYlKpAQePnwIX19fqKur4+HDh+1uGxgYKBPBehrppbWo4vB6nSkIaPpSbW7wEpVWCk9rfXhY60tlF6UoCnpa6pjlxxYmnxHecGD6ALwuqkHEo0z4yqgEwY2kQqx72xVhgx0Ql13eY+oDSYIsSko3lzpXdiUqkRJYsGAB7t+/D2NjY2HilThoNBrxCbRBbFYFgJ7dSaw9mhu8VNU34uSDDEzztUWws2R2fU4jH6+LakCn0zDD1xaNfAHU6D3/RSQpnEY+dv/+QuYOyB/up2PrhQQ82jQCnja9q0ifnQxKSqvTaVj/tivmBNghIbdSaZWoREogOTlZ7L8JkqOvrY53vCx75UygJYYsBi49y4exjqZESkBVIiwUSX
c4ILNK6xCTVYFJA6x6pf/FVLeppHRXwkST8qsx9esH+M80L8wNtJedcDJGauP0+fPnweVyWy3ncrk4f/68LGTqUTQ3484pr0fYEHuY6GgqWiSF4mymA2sDbdxMLhTWgmoPZerApKx0R4OSi/FNYz5xQO/0v9BoXS8p/TijDA08gcwq53YXUiuBjRs3Cks2t6S2thYbN26UiVA9heav2HGH7mHnpReYcvQhNp9/Dk4jX9GiKQwajYYQVzNkl9VLFM2iTB2YlBVZOyApisL5uDwYsRgIcjbpimgqDduIiZzyOvAFHX+siONJehnoNGCQnXL3DZFaCVAU1SpngKIoxMXFQV+/d9kNO4J8xYonpF9ThdZbyUUdbqsqERaKpDn6qiVdcUAm5VfjdVENxntaQkOtd0WytaRlSWlpoSgKTzLK4Galp/QzAYnz7pvLRtNoNAwZMkTsNosXL5aZYD0BVYkTljeBjsbQ0qDjZlIRlgS3X3nWzpipEhEWiqQ5+uq9IQ54nF4Gb7YhXC10O+0zuRDXbArq3VnZXSkpnVZSi9Jarkr0u5BYCfzwww+gKArvvfcevvjiC5GvfnV1dVhZWcHaunfaD9uCfMWKR0tDDUOcTPC/lGJU1jdCX7vtL6Vfn2Zj3duuWDDEAbFZvStMURq0NNTgZqmHh6ml+PVpNsInenTqOAIBhYvxebAx1FZ6M0Z3Y9ciTFTasOQn6U0fgH4q8KxLrASa4/9v3rwJKyurXtMVqyuoSpywIgjpZ4abyUW4k1KMd9qoA9TA4+O/N17j1IMsXPsoGB7WxNzYHjQaDYVVHEREZWGGD7tTYZ1PMsqQX8nB8mFOvf4Zb54JZJZJb759/LcVQBU++CRWAr/++qvY5To6OrC3t4erq6vMhOopqNFp2DDWFbP82XiRV0W+YlswvG+TX+B2clGbSuDK8wKU1nKx9C1Hpau8qKyMcrPAt3fTcf1FQaeUwPm/s7J7a1RQS2yEJaXrpd73cXoZHE1YMNVV/mhAiZXAV199JXZ5TU0Nqqqq4Ofnh0OHDrVq7NKbufw8H+t+fYZDM72VOk5YEVgZaKOfpR7+l9J2Q+/TUZnQVKdj2iD5d4BTVQayDWDI1MD1pCJ8PLqvVPtyeQJcfp4PVwtd9LXQ7SYJVYfmktLSJozlV9Yjp7weM3xU476V2PV/69YtsX+PHz/G9evXwefzcfDgQZkJxuVysXnzZoSEhMDb2xvjx4/HxYsXhetDQkLg5eUFb29v4fpmkpOTMW7cOAQEBODEiRMix124cCESEhJkJmdbUBSFb++mgUZTDbugIghxNUVZLVfYba0liXmVeJpZjnf6W/X6FpLSoK5Gx3BXMyTlVyGnXLqX152/fTRkFvCGzuQKPP7bH+DroBrPvUziv2xtbbFmzRrcu3dPFocDAPB4PJiZmeHkyZN4+vQpdu7ciZ07dyI2Nla4zZEjRxAbG4vY2Fj88ccfwuUHDhxAWFgYLl68iKNHj6K4uBgAcOnSJTg4OMDDo3NOM2mISitDQm4Vpg2yJS+xNghxfWMS+icRUZkAgHmBdnKVqScw+u+OazdeFEq13/m/o4JUIaJFXrCNmCir5aKa0yjxPs1Rgary8Sez1kympqaoqKiQ1eHAZDKxatUq4f99fHwwcOBAxMbGwtvbu919c3Jy4OvrCzMzM9jb2yM/Px9aWlr44YcfJCqLXVRUJFQcLUlNTZVY/u/uNc0C3gtykHif3sYAW0MYMjVwK7kI/x7zxnRRWd+I87F56G9rAC8bA8UJqKIMdTYFQ52OG0lFCBsi2f1X08DDjaRC+NkbSR0O2ZN5U0hO8pLST9LLYa6nCVsj1RhHmSmBp0+fwta2+2xgdXV1SEhIwLx584TLNmzYAIFAAGdnZ6xevRqDBg0CADg7OyMqKgosFgu5ubmwtbXFgQMHsGTJEujo6HR4rp9//hlHjhzptKypxTW4kVSEUW7mJBKoHdToNAzra4ZzsbnIr6yHpX7TQ/Pr0xzUN/IxN4DMAjoDS1MdQ5yMcfdVicTN5zNLazGqnzmp0PoPpC0pXVHHxcvCakzwslSZ6CqJlUBbJaRra2uRlJSE06dPY/369TITrCUURWHjxo3w8vJCUFAQAGD//v1Cs05kZCQWL16MS5cuwdraGuvXr8eOHTtw9uxZbNiwAdnZ2SgoKEBgYCDWrFmDgoICjB07FnPmzBF7vhkzZiAkJKTV8tTUVKxdu7ZDeb+7lw4AWDzUsbOX3GsIcW1SAreTizHLnw2BgEJEVCYMmBqY4GWpaPFUlpFu5rj9shj/e1ncbivOlo3k5w+2h7sVCcNtyRslIFmYaHRGOQDAX0X8AYAUSqCtEtIsFgt2dnbYtGkTJk2aJCu5hFAUhe3bt6OwsBDff/+9ULv6+PgIt5k1axYuX76MO3fuYObMmbC2tsa3334LAODz+ZgzZw7279+Pb775Bn369MG+ffsQGhqKgIAA9OnTp9U5zczMYGZm1il5y2q5+O1pDvrb6MPXvncn20hCsIsp1Og03Eouwix/Nu6nliC9pBZLgx1JKG0XGNnPHJvPJeDGi8I2lQCp0NoxbClLSjf7A1TFKQxIoQQUUUKaoijs3LkTL168wIkTJ8BkMtvclkajia1Kefr0aQwfPhy2trZISUlBWFgYGAwGXFxckJKSIlYJdIWIqEw08ARYONRRZaaDikRfWwOD7Axx/3UJOI18nH6YCRoNmO1PTEFdwVxPC/1t9HH7ZREa+QKxNYBUpQeuImkuKS1prsDjjDLoa2vAxUx1QmyVujrUrl27EB8fj++++07Elp+Xl4fo6GhwuVxwuVycPXsWCQkJQlNRM4WFhbhy5YpwFmNjY4NHjx6htrYWCQkJMvdhcBr5OPUwA9YG2hjnYSHTY/dkRvczx8h+ZnhZUA0tDTpG9jMTOuQInWdkP3NUc3jCEgb/hFRo7RhpSkrXc/l4nlMJHztDlUpulJljWNbk5ubixx9/BIPBwLBhw4TLly5dipEjRyI8PBxZWVnQ0NCAk5MTvv76a7DZbJFj7NmzB2vXroWGhoZw35UrV+Knn37ClClT4OnpKVOZL8bloaSGiy3j+0G9F1dflJZZAWz0zzNATFY55gXaw9WCfIXKgpFu5jhwPQXXXhRicJ/WJaF92qgNpAqlDuSJnTETf7WT1NhMbFY5eAJKpUxBgBIrAWtra7x8+bLN9RcuXOjwGIcOHRL5v6WlJX755ZcuyyYOiqJw/F4adDTVMd1XNTIFlQFOIx9bzycQu3Q34GqhCxtDbdxIKsT2d9xEzJOcRj7M9LRIbSsJaFlSur3w2eZ6QW0VjlRWlFYJqBp/pRQjpbAGi4IcJArJIzRB7NLdB41Gw8h+5jjxIAMvC6uFMyyKorDu12dIK6lBxEJ/LBrqiOiMMlLbqg2EheRKa9tVAk8yyqClQYenihU6JDYLGfHdvXSo0WlYQJLDpILYpbuXUX9nD19PfJM9/NX/UnExPg9W+trQ09JAP0s9zA20Rz9LPaIAxNCsBNrzCzTyBYjJrMAAWwMw1FXrtSrVTIDH4+Gvv
/7Cs2fPUFpaCgAwNjZG//79ERwcDHX13jmxSMqvwt1XJXinvxXJtpQS0nOhe/FzMIKxDgMNvKaWpnHZFTh08xVcLXTx+YwBKuXAVBR2Rky842UJq3ae7cS8KtQ38lWmVERLJH5rZ2RkYOnSpSgsLISXlxeMjJouNisrCydPnoS5uTm++eYb2Nvbd5esSsvxu03JYYvILEBqmlsj/vaU2KW7Aw01Oq5/9BZSi2tw8kEG3K30cHfdcFCgwNLsnR9t0mJjxMS8wfZIzKuEiY6mWJNZaU0DPhjeB+M8VS/BUeK7YNu2bXBzc8P58+ehrS2qEevr67Fp0yZs27ZNoto8PYmiKg4uxufCz94I/W0NFC2OytHcGnFRELFLdwecRj72/PFC1Pk70Bp7Jss2Mq6nIklCHaeRD0MWAyY6DAgoCpxGvkrdvxIrgbi4OERGRrZSAACgra2NDz74AFOmTJGpcKrAyYcZaORTWDSUzAI6i5aGGvpZ6hFHcDcg1vEek4tFQ4njXRI6ClzoCVnXEnswTE1NERcX1+b6uLg4GBv3ruJTdVweIqKyYG/MxIh+5ooWh0BoBXG8d422xi8qrRTXXhTgRX6VWCWRXiJ9S0pFIfFMYPny5di+fTsePXqEgIAA4Qu/tLQUjx49wuXLl7Fr165uE1QZ+e1pDirrG/Hv0S7tJpEQCIqCON67RlvjN8DWADUNPMSLaYgENClZVZlpSawEpkyZAisrK/z44484dOiQSHSQl5cXvv32W2Ez+t4AX0Dhu3vpMGBqYMogG0WLQyCIhTjeu0Zb49ccTmuiI76HsCopWanCAwIDA3vVi749biQVIqO0Dh8MdwKTQaIsCMoJcbx3jY7Gryco2U69vbhcLsrLm+pmGxoagsHofe0Tj99NA0ONjvmkgTxBySGO967R3vj1BCUrlRKIjIzE6dOnkZKSAoFAAACg0+lwcXHBvHnzMHny5G4RUtl4WViNJxnlmDrIBmZ6WooWh0AgKBBVV7ISK4FvvvkG33zzDcLCwrBp0yZhslhZWRkePnyI3bt3o7i4GEuWLOk2YZWF87G5ALSwkCSHEQgEFUdiJRAREYH9+/dj5MiRIsudnJzg6+sLNzc37Nixo1cogfuvSzDUZ4DKan4CgUBoRmIlUF1d3W4TFhsbG9TWqk5sbFcI6mOChcGkfzCBQFB9JE4WGzx4MHbv3o2srKxW67KysrB3714MHjxYpsJ1RFVVFVatWgVvb28MHToUZ86cAdCksBYuXAgfHx+sWbMGfD5fuM9XX32FEydOdOm87/a3UslCUQQCgfBPJJ4J7Nq1C6tXr8aYMWNgYWEh4hMoKCiAj4+P3JPFdu3aBT6fj7t37yIrKwsLFiyAk5MTnj9/DkNDQzx48ADz58/H9evX8fbbbyM7Oxt37twRKovO8u9fn+FBubZKpYYTCASCOCRWAiYmJoiIiEBycjKeP3+OsrIyUBQlTBbr27dvd8rZirq6Oly9ehXnz5+Hjo4O3NzcMHnyZPz2229gMpnw9fUFg8GAj48PsrOzAQC7d+/Gpk2boKbW/ou7qKgIxcXFrZYnJSUBAGjVhYi8+QjDzbgqFQ9MIBB6DqmpqQCAhoaGLh1H6jwBV1dXuLq6dumksiAjIwMA0KdPH+EyV1dXnDhxAtOmTcPjx48xadIkxMTEYPHixbh8+TKsra3h5eXV4bF//vlnHDlypM31GtFNM4k1t7t2DQQCgdBVXrx4gYEDB3Z6f5mlulZWVuL27duYNGmSrA7ZLnV1dWCxRL/C9fT0UFtbi6lTpyI5ORlTp07FW2+9hUGDBmH+/Pk4efIkPv/8c0RHR8PZ2RmbNm0Sm+g2Y8YMhISEtFoeHx+PnTt3Yu/evUqhCJWR1NRUrF27Fp999hmcnJwULY5SQsaoY8gYdUxycjI2bdrU5fGRmRLIz8/Hxo0b5aYEmExmq2ik6upqsFgsaGpqIjw8XLg8PDwcixYtQkxMDJ4/f46IiAhs2bIFv/32G2bOnNnq2GZmZjAzM2vz3K6urnB3d5fdxfRAnJycyBh1ABmjjiFj1DF6el0LVZdYCTTb1dsiPz+/S4JIS3MHs9TUVKEmTE5OhrOzs8h2CQkJyM7OxtatW3Hs2DF4eHiARqPB09MTycnJcpWZQCAQlA2JlcCoUaNAo7VdLpmiqHbXyxomk4kxY8bgv//9L/bu3YucnBxERkbiiy++EG4jEAiwb98+fPLJJwCachnOnDkDLpeLJ0+ewM3NTW7yEggEgjIisRIwNDTExx9/3GYuwOvXr/H+++/LTDBJ2L59O7Zs2YKhQ4eCxWJh5cqVIlVOz5w5g6CgIGGS2+jRo3H9+nUEBgZiwIABmDFjhlzlJRAIBGVDYiXg6emJgoICWFtbi11fXV0NiqJkJpgk6Onp4dChQ22unzt3rsj/1dXV8fnnn3e3WAQCgaAySKwElixZgrq6ujbXs9nsHt9k3tTUFCtWrICpqamiRVFayBh1DBmjjiFj1DGyGiMaJe/PdwKBQCAoDRLXDmqPsrIynDx5stf0EyAQCISeQqfzBBobG3H79m2cO3cOd+/ehb29fasy0wQCgUBQbqRWAs+ePUNkZCSuXLkCU1NTpKWl4fjx43KvIEogEAiEriOxEjh27BjOnTsHHo+HcePG4dSpU+jbty/c3d3bza4lEAgEgvIisU/giy++wPDhw/H777/jo48+knvVUEXTVu8CQhMbNmyAh4cHvL29hX95eXmKFkuhREREIDQ0FB4eHvjoo49E1qWkpGD69Ono378/JkyYgOjoaAVJqVjaG6OQkBB4eXkJ76fx48crSErFwuVysXnzZoSEhAjH4eLFi8L1Xb2XJJ4J7N69GxcuXEBQUBBCQkIwYcIEkcSsnk5bvQsCAgIULZrSEBYWhn//+9+KFkNpMDMzw/Lly/HgwQOUl5cLlzc2NmLZsmWYMWMGIiIicOXKFSxfvhzXr1+Hvr6+AiWWP22NUTNHjhxBcHCwAiRTHng8HszMzHDy5ElYW1sjJiYGS5cuha2tLTw8PLp8L0k8EwgNDcXJkydx4cIF2NvbY/fu3Rg6dCgEAgESExNFunf1NJp7F6xevbpV7wICoS1Gjx6NkSNHwtDQUGT548ePweFwsGjRIjAYDEycOBE2Nja4du2agiRVHG2NEeENTCYTq1atgq2tLeh0Onx8fDBw4EDExsbK5F6SOkTUysoKy5Ytw59//okvv/wS06ZNw549ezB48GCsX79e2sOpBG31Lnj16pWCJFJOzp49Cz8/P7z77rv49ddfFS2O0vLq1Su4uLiATn/z+JH7STwbNmxAQEAA5s6di6dPnypaHKWgrq4OCQkJcHZ2lsm91KVS0gMHDsTAgQOxZcsW3LhxAxcuXOjK4ZSW9noXEJqYO3cu1q1bB319fURHR2PlypXQ1dXFmDFjFC2a0lFbWwtdXV2RZXp6eqiurlaQRMrJ/v374eHhAQCIjIzE4sWLcenSpTZL1/QGKIrCxo0b4eXlhaCgIDx79qzL95JMksUYDAbGjRuHb775RhaHUzra611AaMLd
3R1GRkZQU1ODv78/Zs+ejatXrypaLKWExWKhpqZGZBm5n1rj4+MDLS0taGlpYdasWXBzc8OdO3cULZbCoCgK27dvR2FhIT7//HPQaDSZ3EsyUQI9nZa9C5oR17uA8AY6nS73goKqgrOzM1JSUiAQCITLkpKSyP3UATQardfeUxRFYefOnXjx4gWOHz8OJpMJQDb3ElECEtCyd0FNTQ2Sk5MRGRmJ0NBQRYumNFy+fBk1NTUQCASIjo5GREQERo0apWixFAqPx0NDQwN4PB4EAgEaGhrQ2NgIPz8/MBgMfP/99+Byubh06RJycnJ65Xi1NUZ5eXmIjo4Gl8sFl8vF2bNnkZCQgKCgIEWLrBB27dqF+Ph4fPfdd9DR0REul8W9RArISUhVVRW2bNmCu3fvgsViYdmyZZg9e7aixVIaZs+ejZcvX4LP58PKygpz5swR27qzN3H48GEcOXJEZNnkyZPxySef4OXLl9iyZQtevnwJW1tb7NixA76+vgqSVHG0NUaLFi3CmjVrkJWVBQ0NDTg5OWH16tXw9/dXkKSKIzc3FyEhIWAwGFBXf+PGXbp0Kd5///0u30tECRAIBEIvhpiDCAQCoRdDlACBQCD0YogSIBAIhF4MUQIEAoHQiyFKgEAgEHoxRAkQCARCL4YoAQKBQOjFECVAIBAIvRiiBAi9lpCQEPzyyy9yP29tbS2Cg4ORnZ3d6WM8evQIffv2BY/H67I8ubm5GDp0KKmK20shGcOEHklH7U9PnToFZ2dnMJlMaGlpyUmqJo4ePYq0tDR89tlnnT4Gl8tFZWUlTE1NZSLTunXrYG9vj+XLl8vkeATVgSgBQo+kuLhY+O9vv/0Wz549w+HDh4XL9PX1wWAw5C6XQCBASEgI9uzZgyFDhsj9/G1x7949bNmyBbdu3RJpUELo+ZBfm9AjMTU1Ff5pa2tDQ0NDZBmDwRAxB+Xk5KBv3764du0aQkND4eXlhbCwMJSXl+PKlSsYOXIkfH19sXfvXpFyxmVlZVizZg18fHzg7++PNWvWiO2V20xcXBzKy8tFCqFFRkYiODgYf/zxh7CZeHh4OPh8Pr744gv4+/sjODhYpGnTP81Bhw8fxsyZMxEREYGgoCD4+/tj//79QlkpisKBAwcwdOhQeHp6YsSIEfjpp5+ExwsICEB5eTni4uJkMv4E1aFLncUIhJ7Gl19+ic2bN0NXVxerVq3CqlWrwGKx8NVXXyEvLw8rVqxAQEAAQkJCAAArV66Eubk5zpw5AxqNhv/85z9Yu3Ytjh8/Lvb4sbGxcHFxEakGCQAVFRW4dOkSvv76a+F50tPT4enpiZ9++glXr17F1q1bMXToUBgZGYk99suXL8Fms3Hy5Emkp6dj9erV8PHxQUhICK5cuYLff/8dX3zxBSwsLJCdnS3SjERdXR2urq6IiYnBwIEDZTSaBFWAKAECoQVLly6Fn58fAGDq1Kk4ePAg7t+/D2NjY7i4uMDf3x+PHz9GSEgInjx5gvT0dJw4cUL4Ug8PD0dwcDAKCgpgYWHR6vj5+fli7fhcLhe7d++GiYmJ8DyFhYX46KOPAABLlizBsWPHEBcXJ1RA/0RdXR27du2CpqYmnJyc4O/vjydPniAkJAQFBQWws7PDwIEDQaPRxLZoNDU1RV5eXqfHjqCaECVAILSgpUPZ2NgYRkZGMDY2Fi4zMTFBWVkZACAlJQVlZWVia7dnZ2eLVQJcLlesL8LIyAgmJiYi52nZO1ZNTQ0GBgbCc4vDzs4OmpqaIscoLS0FAIwePRrff/89xo4di+DgYIwcOVKo7JrR1NQEh8Np8/iEnglRAgRCC1qaaWg0WiuzDY1GA5/PB9AU6slms3Hs2LFWxzE3Nxd7fAMDA+Tm5rZ73vbO3V4cR3uy2tjY4Nq1a7hz5w7u3buH999/H5MnT8bWrVuF21dWVnYYVUXoeRDHMIHQSVxdXZGfnw8dHR3Y2dmJ/LUVdurq6oq0tDQ5S9oEk8nE22+/jd27d2P37t349ddfRdanpqbC1dVVIbIRFAdRAgRCJwkKCoKLiwtWrFiB6OhoZGdn4/79+yJf1//E398fRUVFKCgokKOkwLlz5xAZGYnU1FSkp6fj5s2bcHBwEK4vKChAYWFhr2zf2NshSoBA6CR0Oh3Hjx+Hg4MDVqxYgfHjxyM8PFzElv9PjI2NMXToUFy9elWOkgK6urr48ccfMXXqVEybNg0VFRU4ePCgcP3Vq1cRFBQEMzMzucpFUDwkWYxAkDOxsbHYsGEDLl++DDU1NUWLA4FAgLFjx2LPnj3w8fFRtDgEOUNmAgSCnPH29sb8+fNRWFioaFEAAEVFRZg3bx5RAL0UMhMgEAiEXgyZCRAIBEIvhigBAoFA6MUQJUAgEAi9GKIECAQCoRdDlACBQCD0YogSIBAIhF4MUQIEAoHQiyFKgEAgEHoxRAkQCARCL+b/AcTf3RS0yumnAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.set_context('paper')\n", + "sns.set_style('ticks')\n", + "fig, ax = plt.subplots(figsize=(4, 2))\n", + "sns.lineplot(df, x='time', y='utilisation', marker='.', markersize=10.)\n", + "ax.set_xlabel('Time (mins)')\n", + "ax.set_ylabel('A100 Utilisation (%)')\n", + "ax.yaxis.set_major_formatter(PercentFormatter(100.))\n", + "ax.set_ylim([0., 100.])\n", + "ax.set_xlim([0., 20.])\n", + "ax.axhspan(80, 90., color='green', alpha=.2)\n", + "ax.text(1.2, 110., 'Epoch 1')\n", + "ax.text(7.6, 110., 'Epoch 2')\n", + "ax.text(13.6, 110., 'Epoch 3')\n", + "fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "fig.savefig('utilisation.pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "zoobot39_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b66a1f3582d74cb0bea4c1dc8efc03cbe4c82a73 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 13:21:48 -0500 Subject: [PATCH 104/307] force commit sh --- only_for_me/narval/gpu_split.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 only_for_me/narval/gpu_split.sh diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh new file mode 100644 index 00000000..03a0629e --- /dev/null +++ b/only_for_me/narval/gpu_split.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --time=0:05:0 +#SBATCH --nodes=1 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=2 +#SBATCH --cpus-per-task=10 +#SBATCH --mem-per-cpu 4G +#SBATCH --gres=gpu:v100:2 + +nvidia-smi + +PYTHON=/home/walml/envs/zoobot39_dev/bin/python +# source ~/envs/zoobot39_dev/bin/activate + +# mkdir $SLURM_TMPDIR/cache +mkdir /tmp/cache + +export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. +# export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
+# echo "r$SLURM_NODEID master: $MASTER_ADDR" +# echo "r$SLURM_NODEID Launching python script" + +REPO_DIR=/project/def-bovy/walml/zoobot +srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py + From 7cd11a74436ebbb1222e890c8dde44f222912583 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 13:24:50 -0500 Subject: [PATCH 105/307] request less --- only_for_me/narval/gpu_split.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index 03a0629e..62c067b7 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -3,7 +3,7 @@ #SBATCH --nodes=1 #SBATCH --ntasks=2 #SBATCH --ntasks-per-node=2 -#SBATCH --cpus-per-task=10 +#SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G #SBATCH --gres=gpu:v100:2 @@ -21,5 +21,5 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID Launching python script" REPO_DIR=/project/def-bovy/walml/zoobot -srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py +srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 2 From 84eb5477a0f37273ef92811c265eaa78e492680c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 13:41:42 -0500 Subject: [PATCH 106/307] single gpu? --- only_for_me/narval/gpu_split.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index 62c067b7..ccaa9818 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -5,7 +5,7 @@ #SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 +#SBATCH --gres=gpu:v100:1 nvidia-smi @@ -13,7 +13,7 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python # source ~/envs/zoobot39_dev/bin/activate # mkdir $SLURM_TMPDIR/cache -mkdir /tmp/cache +# mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
@@ -21,5 +21,5 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID Launching python script" REPO_DIR=/project/def-bovy/walml/zoobot -srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 2 +srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 1 From 5ac6be9070ba0f6a090d5e8f5be0a81e18e0fe26 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 13:43:46 -0500 Subject: [PATCH 107/307] use tmpdir --- only_for_me/narval/gpu_split.py | 3 ++- only_for_me/narval/gpu_split.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py index 6a93ad2b..9e7802d3 100644 --- a/only_for_me/narval/gpu_split.py +++ b/only_for_me/narval/gpu_split.py @@ -48,6 +48,7 @@ def configure_optimizers(self): def main(): logging.basicConfig(level=logging.INFO) + logging.warning('Script start') parser = argparse.ArgumentParser() parser.add_argument('--save-dir', dest='save_dir', type=str) @@ -67,7 +68,7 @@ def main(): save_dir = '/home/walml/repos/temp' else: - save_dir = '/project/def-bovy/walml/temp' + save_dir = os.environ['SLURM_TMPDIR'] schema = schemas.decals_all_campaigns_ortho_schema diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index ccaa9818..0c5d27b3 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --time=0:05:0 +#SBATCH --time=0:15:0 #SBATCH --nodes=1 #SBATCH --ntasks=2 #SBATCH --ntasks-per-node=2 @@ -20,6 +20,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID master: $MASTER_ADDR" # echo "r$SLURM_NODEID Launching python script" +echo 'Running script' REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 1 From f2c89b2e95adb319cf9d95cdf74ba9bd8afe4d73 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 14:08:28 -0500 Subject: [PATCH 108/307] it doesn't split! 
add my logging --- only_for_me/narval/gpu_split.py | 8 ++++---- only_for_me/narval/gpu_split.sh | 6 +++--- zoobot/pytorch/datasets/webdatamodule.py | 3 +-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py index 9e7802d3..79aad30b 100644 --- a/only_for_me/narval/gpu_split.py +++ b/only_for_me/narval/gpu_split.py @@ -90,7 +90,7 @@ def main(): cache_dir=None # TODO pass through the rest ) - use_distributed_sampler=False + # use_distributed_sampler=False trainer = pl.Trainer( # log_every_n_steps=16, # at batch 512 (A100 MP max), DR5 has ~161 train steps @@ -104,7 +104,7 @@ def main(): max_epochs=1, default_root_dir=save_dir, # plugins=plugins, - use_distributed_sampler=use_distributed_sampler + # use_distributed_sampler=use_distributed_sampler ) # logging.info((trainer.strategy, trainer.world_size, @@ -115,8 +115,8 @@ def main(): trainer.fit(lightning_model, datamodule) # uses batch size of datamodule # batch size 16 - # shard size 16, 10 shards with 8 being assigned as training shards so 8*32 train images, 8*2 train batches + # shard size 16, 10 shards with 8 being assigned as training shards so 8*32 train images, 8*2=16 train batches if __name__=='__main__': - main() \ No newline at end of file + main() diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index 0c5d27b3..715ca5e3 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -1,11 +1,11 @@ #!/bin/bash -#SBATCH --time=0:15:0 +#SBATCH --time=0:10:0 #SBATCH --nodes=1 #SBATCH --ntasks=2 #SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:2 nvidia-smi @@ -22,5 +22,5 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t echo 'Running script' REPO_DIR=/project/def-bovy/walml/zoobot -srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 1 +srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 2 diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 27e0c408..bec1c7f4 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -81,8 +81,7 @@ def make_loader(self, urls, mode="train"): dataset = ( # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0 - # , nodesplitter=nodesplitter_func + wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func ) .shuffle(shuffle) .decode("rgb") From a21463f78dc9ddf95107697458f7e653630c27cc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 14:17:39 -0500 Subject: [PATCH 109/307] make once and exit --- only_for_me/narval/gpu_split.py | 10 +++++----- only_for_me/narval/gpu_split.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py index 79aad30b..fe975215 100644 --- a/only_for_me/narval/gpu_split.py +++ b/only_for_me/narval/gpu_split.py @@ -64,16 +64,16 @@ def main(): parser.add_argument('--seed', dest='random_state', default=1, type=int) args = parser.parse_args() - if os.path.isdir('/home/walml/repos/zoobot'): - save_dir = '/home/walml/repos/temp' + # if os.path.isdir('/home/walml/repos/zoobot'): + save_dir = '/home/walml/repos/temp' - else: - save_dir = 
os.environ['SLURM_TMPDIR'] + # else: + # save_dir = os.environ['SLURM_TMPDIR'] schema = schemas.decals_all_campaigns_ortho_schema shards = webdataset_utils.make_mock_wds(save_dir, schema.label_cols, n_shards=10, shard_size=32) - # exit() + exit() # webdataset_utils.load_wds_directly(shards[0], max_to_load=None) # webdataset_utils.load_wds_with_webdatamodule(shards, label_cols=schema.label_cols, max_to_load=None) # exit() diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index 715ca5e3..e46faabc 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -5,7 +5,7 @@ #SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 +#### SBATCH --gres=gpu:v100:2 nvidia-smi From 6181d407e9aa64c99d22bbd22482a693f218287a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 14:21:47 -0500 Subject: [PATCH 110/307] 1 task only --- only_for_me/narval/gpu_split.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index e46faabc..e377194b 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --time=0:10:0 #SBATCH --nodes=1 -#SBATCH --ntasks=2 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G #### SBATCH --gres=gpu:v100:2 From 178e6fd73348664a67beb6ac353a680627072d10 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 14:35:45 -0500 Subject: [PATCH 111/307] run on premade shards --- only_for_me/narval/gpu_split.py | 6 ++++-- only_for_me/narval/gpu_split.sh | 15 +++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py index fe975215..db2b5ea6 100644 --- a/only_for_me/narval/gpu_split.py +++ b/only_for_me/narval/gpu_split.py @@ -2,6 +2,7 @@ import logging import os import time +import glob import torch import torch.nn.functional as F @@ -72,10 +73,11 @@ def main(): schema = schemas.decals_all_campaigns_ortho_schema - shards = webdataset_utils.make_mock_wds(save_dir, schema.label_cols, n_shards=10, shard_size=32) - exit() + # shards = webdataset_utils.make_mock_wds(save_dir, schema.label_cols, n_shards=10, shard_size=32) + # exit() # webdataset_utils.load_wds_directly(shards[0], max_to_load=None) # webdataset_utils.load_wds_with_webdatamodule(shards, label_cols=schema.label_cols, max_to_load=None) + shards = list(glob.glob('/home/walml/repos/temp/mock_shard_*_32.tar')) # exit() train_shards = shards[:8] diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index e377194b..4529e215 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -1,11 +1,16 @@ #!/bin/bash #SBATCH --time=0:10:0 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --nodes=1 # This needs to match Trainer(num_nodes=...) +#SBATCH --ntasks-per-node=2 # This needs to match Trainer(devices=...). Presumably is PER TASK. +# this .sh always runs once, but the srun command (for parallel slurm tasks) gets called twice (ntasks). Lightning then hooks onto that by detecting it is in a slurm environment. #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G -#### SBATCH --gres=gpu:v100:2 +#SBATCH --gres=gpu:v100:2 # This needs to match Trainer(devices=...). Presumably is PER TASK. 
Total GPU = nodes*devices + +# https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_1.html#setup-the-training-script +# https://lightning.ai/docs/pytorch/stable/clouds/cluster_advanced.html + +#### SBATCH --ntasks-per-node=1 nvidia-smi @@ -16,6 +21,8 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python # mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. +# instructed by Compute Canada, not lightning + # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. # echo "r$SLURM_NODEID master: $MASTER_ADDR" # echo "r$SLURM_NODEID Launching python script" From 2c8d07eb74cf7a67ec4e89ce5c9e515367439468 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 17:25:32 -0500 Subject: [PATCH 112/307] it worked - 8 batches? try 4 gpus --- only_for_me/narval/gpu_split.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index 4529e215..0a37524b 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --time=0:10:0 #SBATCH --nodes=1 # This needs to match Trainer(num_nodes=...) -#SBATCH --ntasks-per-node=2 # This needs to match Trainer(devices=...). Presumably is PER TASK. +#SBATCH --ntasks-per-node=4 # This needs to match Trainer(devices=...). Presumably is PER TASK. # this .sh always runs once, but the srun command (for parallel slurm tasks) gets called twice (ntasks). Lightning then hooks onto that by detecting it is in a slurm environment. #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 # This needs to match Trainer(devices=...). Presumably is PER TASK. Total GPU = nodes*devices +#SBATCH --gres=gpu:v100:4 # This needs to match Trainer(devices=...). Presumably is PER TASK. Total GPU = nodes*devices # https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_1.html#setup-the-training-script # https://lightning.ai/docs/pytorch/stable/clouds/cluster_advanced.html @@ -29,5 +29,5 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t echo 'Running script' REPO_DIR=/project/def-bovy/walml/zoobot -srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 2 +srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 4 From c5864a9802dd211109a4f1e5474163519a30545b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 17:32:53 -0500 Subject: [PATCH 113/307] 4 gpu works! how about 2 nodes, 4 gpu each? --- only_for_me/narval/gpu_split.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index 0a37524b..d4f318d4 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --time=0:10:0 -#SBATCH --nodes=1 # This needs to match Trainer(num_nodes=...) +#SBATCH --nodes=2 # This needs to match Trainer(num_nodes=...) #SBATCH --ntasks-per-node=4 # This needs to match Trainer(devices=...). Presumably is PER TASK. # this .sh always runs once, but the srun command (for parallel slurm tasks) gets called twice (ntasks). Lightning then hooks onto that by detecting it is in a slurm environment. 
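# Sketch of the matching Trainer call, assuming gpu_split.py builds it from its --gpus/--nodes args
# (argument names here are assumptions, not copied from the script):
#   trainer = pl.Trainer(accelerator='gpu', devices=args.gpus, num_nodes=args.nodes, strategy='ddp')
# i.e. devices should equal --ntasks-per-node and the GPU count in --gres; num_nodes should equal --nodes.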
#SBATCH --cpus-per-task=4 @@ -10,7 +10,7 @@ # https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_1.html#setup-the-training-script # https://lightning.ai/docs/pytorch/stable/clouds/cluster_advanced.html -#### SBATCH --ntasks-per-node=1 + nvidia-smi @@ -29,5 +29,5 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t echo 'Running script' REPO_DIR=/project/def-bovy/walml/zoobot -srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 4 +srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 4 --nodes 2 From 5c77a4a007f2c23bfd91c81102e1eb60515d63b3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 19 Nov 2023 17:42:49 -0500 Subject: [PATCH 114/307] Multi-node works. Clean up. --- only_for_me/narval/gpu_split.sh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh index d4f318d4..ab99a982 100644 --- a/only_for_me/narval/gpu_split.sh +++ b/only_for_me/narval/gpu_split.sh @@ -1,11 +1,14 @@ #!/bin/bash #SBATCH --time=0:10:0 #SBATCH --nodes=2 # This needs to match Trainer(num_nodes=...) -#SBATCH --ntasks-per-node=4 # This needs to match Trainer(devices=...). Presumably is PER TASK. -# this .sh always runs once, but the srun command (for parallel slurm tasks) gets called twice (ntasks). Lightning then hooks onto that by detecting it is in a slurm environment. +#SBATCH --ntasks-per-node=4 # This needs to match Trainer(devices=...). This is PER TASK. +# srun is slurm's way to start many jobs from the same sbatch script +# the sbatch script runs *once* and then the srun command is called ntasks-per-node times on each node +# Lightning knows via env variables that it is running on slurm and identifies which DDP instance it should spin up +# webdatasets then reads from lighting with LOCAL_RANK worker we're on and loads the appropriate data #SBATCH --cpus-per-task=4 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 # This needs to match Trainer(devices=...). Presumably is PER TASK. Total GPU = nodes*devices +#SBATCH --gres=gpu:v100:4 # This needs to match Trainer(devices=...). This is PER TASK. Total GPU = nodes*devices # https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_1.html#setup-the-training-script # https://lightning.ai/docs/pytorch/stable/clouds/cluster_advanced.html @@ -15,19 +18,13 @@ nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python -# source ~/envs/zoobot39_dev/bin/activate # mkdir $SLURM_TMPDIR/cache # mkdir /tmp/cache -export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. +export NCCL_BLOCKING_WAIT=1 # "Set this environment variable if you wish to use the NCCL backend for inter-GPU communication." # instructed by Compute Canada, not lightning -# export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
-# echo "r$SLURM_NODEID master: $MASTER_ADDR" -# echo "r$SLURM_NODEID Launching python script" - echo 'Running script' REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 4 --nodes 2 - From 9d9c991c7dbeb06e74d03146f79066a2ea1eeb2b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 10:59:00 -0500 Subject: [PATCH 115/307] prepare to update dependencies --- README.md | 10 +++--- benchmarks/pytorch/run_benchmarks.sh | 17 ++++----- setup.py | 54 ++++++++++++++++------------ 3 files changed, 46 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 8bcdc1f5..93430ad8 100755 --- a/README.md +++ b/README.md @@ -32,16 +32,16 @@ Download the code using git: And then pick one of the three commands below to install Zoobot and either PyTorch (recommended) or TensorFlow: - # Zoobot with PyTorch and a GPU. Requires CUDA 11.3. - pip install -e "zoobot[pytorch_cu113]" --extra-index-url https://download.pytorch.org/whl/cu113 + # Zoobot with PyTorch and a GPU. Requires CUDA 12.1 (or CUDA 11.8, if you use `_cu118` instead) + pip install -e "zoobot[pytorch-cu121]" --extra-index-url https://download.pytorch.org/whl/cu121 # OR Zoobot with PyTorch and no GPU - pip install -e "zoobot[pytorch_cpu]" --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e "zoobot[pytorch-cpu]" --extra-index-url https://download.pytorch.org/whl/cpu # OR Zoobot with PyTorch on Mac with M1 chip - pip install -e "zoobot[pytorch_m1]" + pip install -e "zoobot[pytorch-m1]" - # OR Zoobot with TensorFlow. Works with and without a GPU, but if you have a GPU, you need CUDA 11.2. + # OR Zoobot with TensorFlow. Works with and without a GPU, but if you have a GPU, you need CUDA 11.2. pip install -e "zoobot[tensorflow] This installs the downloaded Zoobot code using pip [editable mode](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs) so you can easily change the code locally. Zoobot is also available directly from pip (`pip install zoobot[option]`). Only use this if you are sure you won't be making changes to Zoobot itself. For Google Colab, use `pip install zoobot[pytorch_colab]` diff --git a/benchmarks/pytorch/run_benchmarks.sh b/benchmarks/pytorch/run_benchmarks.sh index 07094601..3ff5e946 100755 --- a/benchmarks/pytorch/run_benchmarks.sh +++ b/benchmarks/pytorch/run_benchmarks.sh @@ -13,11 +13,11 @@ SEED=$RANDOM # GZ Evo i.e. 
all galaxies -# effnet, greyscale and color -# sbatch --job-name=evo_py_gr_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB -# sbatch --job-name=evo_py_gr_eff_300_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=300,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB -# sbatch --job-name=evo_py_co_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,GPUS=2,SEED=$SEED $TRAIN_JOB -# sbatch --job-name=evo_py_co_eff_300_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=128,RESIZE_AFTER_CROP=300,DATASET=gz_evo,COLOR_STRING=--color,GPUS=2,SEED=$SEED $TRAIN_JOB +# effnet, greyscale and color, 224 and 300px +sbatch --job-name=evo_py_gr_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB +sbatch --job-name=evo_py_gr_eff_300_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=300,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB +sbatch --job-name=evo_py_co_eff_224_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,GPUS=2,SEED=$SEED $TRAIN_JOB +sbatch --job-name=evo_py_co_eff_300_$SEED --export=ARCHITECTURE=efficientnet_b0,BATCH_SIZE=128,RESIZE_AFTER_CROP=300,DATASET=gz_evo,COLOR_STRING=--color,GPUS=2,SEED=$SEED $TRAIN_JOB # and resnet18 # sbatch --job-name=evo_py_gr_res18_224_$SEED --export=ARCHITECTURE=resnet18,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB @@ -26,7 +26,7 @@ SEED=$RANDOM # sbatch --job-name=evo_py_gr_res50_224_$SEED --export=ARCHITECTURE=resnet50,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB # sbatch --job-name=evo_py_gr_res50_300_$SEED --export=ARCHITECTURE=resnet50,BATCH_SIZE=256,RESIZE_AFTER_CROP=300,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB # color 224 version -sbatch --job-name=evo_py_co_res50_224_$SEED --export=ARCHITECTURE=resnet50,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB +# sbatch --job-name=evo_py_co_res50_224_$SEED --export=ARCHITECTURE=resnet50,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB # and with max-vit tiny because hey transformers are cool # smaller batch size due to memory @@ -35,11 +35,12 @@ sbatch --job-name=evo_py_co_res50_224_$SEED --export=ARCHITECTURE=resnet50,BATCH # and max-vit small (works badly) # sbatch --job-name=evo_py_gr_vitsmall_224_$SEED --export=ARCHITECTURE=maxvit_small_224,BATCH_SIZE=64,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB -# and convnext (works badly) +# and convnext (works badly, would really like to try again but bigger) # sbatch --job-name=evo_py_gr_$SEED --export=ARCHITECTURE=convnext_nano,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB # and vit # sbatch --job-name=evo_py_gr_vittinyp16_224_$SEED 
--export=ARCHITECTURE=vit_tiny_patch16_224,BATCH_SIZE=128,RESIZE_AFTER_CROP=224,DATASET=gz_evo,MIXED_PRECISION_STRING=--mixed-precision,GPUS=2,SEED=$SEED $TRAIN_JOB - +# and swinv2 +# TODO # and in color with no mixed precision, for specific project # sbatch --job-name=evo_py_co_res50_224_fullprec_$SEED --export=ARCHITECTURE=resnet50,BATCH_SIZE=256,RESIZE_AFTER_CROP=224,DATASET=gz_evo,COLOR_STRING=--color,GPUS=2,SEED=$SEED $TRAIN_JOB diff --git a/setup.py b/setup.py index 0da18e1b..712233e5 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="zoobot", - version="1.0.5", + version="1.0.6", author="Mike Walmsley", author_email="walmsleymk1@gmail.com", description="Galaxy morphology classifiers", @@ -22,51 +22,61 @@ packages=setuptools.find_packages(), python_requires=">=3.8", # recommend 3.9 for new users. TF needs >=3.7.2, torchvision>=3.8 extras_require={ - 'pytorch_cpu': [ + 'pytorch-cpu': [ # A100 GPU currently only seems to support cuda 11.3 on manchester cluster, let's stick with this version for now # very latest version wants cuda 11.6 - 'torch == 1.12.1+cpu', - 'torchvision == 0.13.1+cpu', - 'torchaudio == 0.12.1', + 'torch == 2.1.0+cpu', + 'torchvision == 0.16.0+cpu', + 'torchaudio >= 2.1.0', 'pytorch-lightning >= 2.0.0', # 'simplejpeg', 'albumentations', - 'pyro-ppl == 1.8.0', + 'pyro-ppl >= 1.8.6', 'torchmetrics == 0.11.0', - 'timm == 0.6.12' + 'timm == 0.9.10' ], - 'pytorch_m1': [ + 'pytorch-m1': [ # as above but without the +cpu (and the extra-index-url in readme has no effect) # all matching pytorch versions for an m1 system will be cpu - 'torch == 1.12.1', - 'torchvision == 0.13.1', - 'torchaudio == 0.12.1', + 'torch == 2.1.0', + 'torchvision == 0.16.0', + 'torchaudio >= 2.1.0', 'pytorch-lightning >= 2.0.0', 'albumentations', - 'pyro-ppl == 1.8.0', + 'pyro-ppl >= 1.8.6', 'torchmetrics == 0.11.0', - 'timm == 0.6.12' + 'timm >= 0.9.10' ], # as above but without pytorch itself # for GPU, you will also need e.g. 
cudatoolkit=11.3, 11.6 # https://pytorch.org/get-started/previous-versions/#v1121 - 'pytorch_cu113': [ - 'torch == 1.12.1+cu113', - 'torchvision == 0.13.1+cu113', - 'torchaudio == 0.12.1', + 'pytorch-cu118': [ + 'torch == 2.1.0+cu118', + 'torchvision == 0.16.0+cu118', + 'torchaudio >= 2.1.0', 'pytorch-lightning >= 2.0.0', 'albumentations', - 'pyro-ppl == 1.8.0', + 'pyro-ppl >= 1.8.6', 'torchmetrics == 0.11.0', - 'timm == 0.6.12' - ], - 'pytorch_colab': [ + 'timm >= 0.9.10' + ], # exactly as above, but _cu121 for cuda 12.1 (the current default) + 'pytorch-cu121': [ + 'torch == 2.1.0+cu121', + 'torchvision == 0.16.0+cu121', + 'torchaudio >= 2.1.0', + 'pytorch-lightning >= 2.0.0', + 'albumentations', + 'pyro-ppl >= 1.8.6', + 'torchmetrics == 0.11.0', + 'timm >= 0.9.10' + ], + 'pytorch-colab': [ # colab includes pytorch already 'pytorch-lightning >= 2.0.0', 'albumentations', 'pyro-ppl>=1.8.0', 'torchmetrics==0.11.0', - 'timm == 0.6.12' + 'timm == 0.9.10' ], # TODO may add narval/Digital Research Canada config 'tensorflow': [ From 42e18f27df48967c71c5200fe4f1dd57542f1ba8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 15:42:38 -0500 Subject: [PATCH 116/307] make desi labelled webdatasets --- only_for_me/narval/make_webdataset_script.py | 136 +++++++++++++++++-- only_for_me/narval/train.py | 8 +- only_for_me/narval/train.sh | 10 +- zoobot/pytorch/datasets/webdataset_utils.py | 6 +- 4 files changed, 133 insertions(+), 27 deletions(-) diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py index ad6cb428..b1f5887f 100644 --- a/only_for_me/narval/make_webdataset_script.py +++ b/only_for_me/narval/make_webdataset_script.py @@ -1,43 +1,149 @@ import logging -from itertools import islice -import glob +import numpy as np +import pandas as pd +from astropy import units as u +from astropy.coordinates import SkyCoord from PIL import Image # necessary to avoid PIL.Image error assumption in web_datasets from galaxy_datasets.shared import label_metadata from galaxy_datasets import gz2 +from sklearn.model_selection import train_test_split from zoobot.pytorch.datasets import webdataset_utils +def dataset_to_webdataset(dataset_name, dataset_func, label_cols, divisor=4096): + + train_catalog, _ = dataset_func(root=f'/home/walml/data/galaxy-datasets/{dataset_name}', download=True, train=True) + test_catalog, _ = dataset_func(root=f'/home/walml/data/galaxy-datasets/{dataset_name}', download=False, train=False) + + catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=divisor) + + +def catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=4096): + for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: + n_shards = len(catalog) // divisor + logging.info(n_shards) + + catalog = catalog[:n_shards*divisor] + logging.info(len(catalog)) + + save_loc = f"/home/walml/data/wds/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically + + webdataset_utils.df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards) + + # webdataset_utils.load_wds_directly(save_loc) + + # webdataset_utils.load_wds_with_augmentation(save_loc) + + # webdataset_utils.load_wds_with_webdatamodule([save_loc], label_cols) + + def main(): - logging.basicConfig(level=logging.INFO) + # for converting my galaxy-dataset datasets + # dataset_name = 'gz2' + # dataset_func = gz2 + # label_cols = label_metadata.gz2_ortho_label_cols + # dataset_to_webdataset(dataset_name, 
label_cols, dataset_func) + + + + # for converting other catalogs e.g. DESI + dataset_name = 'desi_labelled' + label_cols = label_metadata.decals_all_campaigns_ortho_label_cols + columns = [ + 'dr8_id', 'brickid', 'objid', 'ra', 'dec' + ] + df = pd.read_parquet('/home/walml/repos/decals-rings/data/master_all_file_index_passes_file_checks.parquet', columns=columns) + # desi pipeline shreds sources. Be careful to deduplicate. + + columns = ['id_str'] + label_cols + votes = pd.concat([ + pd.read_parquet(f'/media/walml/beta/galaxy_zoo/decals/dr8/catalogs/training_catalogs/{campaign}_ortho_v5_labelled_catalog.parquet', columns=columns) + for campaign in ['dr12', 'dr5', 'dr8'] + ], axis=0) + assert votes['id_str'].value_counts().max() == 1, votes['id_str'].value_counts() + votes['dr8_id'] = votes['id_str'] + df = pd.merge(df, votes[['dr8_id']], on='dr8_id', how='inner') + + df['relative_file_loc'] = df.apply(lambda x: f"{x['brickid']}/{x['brickid']}_{x['objid']}.jpg", axis=1) + df['file_loc'] = '/home/walml/data/desi/jpg/' + df['relative_file_loc'] + + df_dedup = remove_close_sky_matches(df) + print(len(df_dedup)) + # df_dedup2 = remove_close_sky_matches(df_dedup) + # print(len(df_dedup2)) + df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') + + df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') - train_catalog, _ = gz2(root='/home/walml/repos/zoobot/only_for_me/narval/temp', download=True, train=True) + # columns = ['id_str', 'smooth-or-featured-dr12_total-votes', 'smooth-or-featured-dr5_total-votes', 'smooth-or-featured-dr8_total-votes'] - divisor = 4096 - n_shards = len(train_catalog) // divisor - logging.info(n_shards) + df_dedup_with_votes = pd.merge(df_dedup, votes, how='inner', on='dr8_id') - train_catalog = train_catalog[:n_shards*divisor] - logging.info(len(train_catalog)) - label_cols = label_metadata.gz2_ortho_label_cols + train_catalog, test_catalog = train_test_split(df_dedup_with_votes, test_size=0.2, random_state=42) + train_catalog.to_parquet('/home/walml/data/wds/desi_labelled/train_catalog_v1.parquet', index=False) + test_catalog.to_parquet('/home/walml/data/wds/desi_labelled/test_catalog_v1.parquet', index=False) + + catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=4096) + + - save_loc = "/home/walml/repos/zoobot/only_for_me/narval/gz2/gz2_train.tar" - webdataset_utils.df_to_wds(train_catalog, label_cols, save_loc, n_shards=n_shards) - # webdataset_utils.load_wds_directly(save_loc) +def remove_close_sky_matches(df, seplimit=20*u.arcsec, col_to_prioritise='ra'): + + catalog = SkyCoord(ra=df['ra'].values * u.deg, dec=df['dec'].values * u.deg) + + search_coords = catalog + + idxc, idxcatalog, d2d, _ = catalog.search_around_sky(search_coords, seplimit=seplimit) + # idxc is index in search coords + # idxcatalog is index in catalog + # steps through all indexes in both that are within seplimit + # d2d gives the distance (not used here) + + # includes self-match, so remove these + idxc = idxc[d2d > 0] + idxcatalog = idxcatalog[d2d > 0] + d2d = d2d[d2d > 0] + + indices_to_drop = [] + for search_index_val in pd.unique(idxc): + matched_indices = idxcatalog[idxc == search_index_val] + matched_indices_including_self = matched_indices.tolist() + [search_index_val] + + # use RA as tiebreaker + matching_galaxies = df.iloc[matched_indices_including_self] + highest = matching_galaxies.index[np.argmax(matching_galaxies[col_to_prioritise])] + 
these_indices_to_drop = list(set(matched_indices_including_self) - set([highest])) + indices_to_drop += these_indices_to_drop + + indices_to_drop = set(indices_to_drop) + all_indices = np.arange(len(df)) # index is like this, for sure + indices_to_keep = set(all_indices) - indices_to_drop + df_dedup = df.iloc[list(indices_to_keep)] + return df_dedup + - # webdataset_utils.load_wds_with_augmentation(save_loc) - webdataset_utils.load_wds_with_webdatamodule(save_loc, label_cols) if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + main() + + + + + # df = df[:100000] + # df['total_votes'] = df['smooth-or-featured-dr12_total-votes'] + df['smooth-or-featured-dr5_total-votes'] + df['smooth-or-featured-dr8_total-votes'] + # df['total_votes'] = df['total_votes'].fillna(0) + # df['random'] = np.random.rand(len(df)) \ No newline at end of file diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 65ce44c0..105b6155 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -62,12 +62,12 @@ search_str = '/home/walml/repos/zoobot/gz_decals_5_train_*.tar' else: - search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/gz_decals_5/full/gz_decals_5_train_*.tar' + search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled/desi_labelled_train_*.tar' all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str - train_urls, val_urls = all_urls[:38], all_urls[38:] - schema = schemas.decals_dr5_ortho_schema + train_urls, val_urls = all_urls[:8], all_urls[8:] + schema = schemas.decals_all_campaigns_ortho_schema # debug mode if args.debug: @@ -107,7 +107,7 @@ mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=9, + num_workers=8, random_state=random_state, learning_rate=1e-3, # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 4525aaa7..9cd30663 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,11 +1,10 @@ #!/bin/bash #SBATCH --time=1:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=10 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:a100:4 +#SBATCH --gres=gpu:v100:1 nvidia-smi @@ -23,7 +22,8 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ - --batch-size 256 \ + --batch-size 128 \ + --gpus 1 --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index a4a22cc8..7ee46bbf 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -37,12 +37,12 @@ def make_mock_wds(save_dir: str, label_cols: List, n_shards: int, shard_size: in -def df_to_wds(df: pd.DataFrame, label_cols, save_loc, n_shards): +def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int): + assert '.tar' in save_loc df['id_str'] = df['id_str'].astype(str).str.replace('.', '_') shard_dfs = np.array_split(df, n_shards) - logging.info('shards: ', len(shard_dfs)) - logging.info('shard size: ', len(shard_dfs[0])) + logging.info(f'shards: {len(shard_dfs)}. 
Shard size: {len(shard_dfs[0])}') for shard_n, shard_df in tqdm.tqdm(enumerate(shard_dfs), total=len(shard_dfs)): shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar') logging.info(shard_save_loc) From dba85dca14799b4c77da125c88b650e38b993123 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 16:16:53 -0500 Subject: [PATCH 117/307] typo --- only_for_me/narval/train.py | 3 ++- only_for_me/narval/train.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 105b6155..5993af4b 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -66,7 +66,7 @@ all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str - train_urls, val_urls = all_urls[:8], all_urls[8:] + train_urls, val_urls = all_urls[:25], all_urls[25:] schema = schemas.decals_all_campaigns_ortho_schema # debug mode @@ -99,6 +99,7 @@ epochs=epochs, # rely on early stopping patience=10, # augmentation parameters + # color=args.color, color=args.color, resize_after_crop=args.resize_after_crop, # hardware parameters diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 9cd30663..3a52382f 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -23,7 +23,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ --batch-size 128 \ - --gpus 1 + --gpus 1 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From f93a47f813c5be0cded8fc041049c874947c2832 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 16:23:39 -0500 Subject: [PATCH 118/307] runs, 2xv100 next --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 5993af4b..423f1501 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -108,7 +108,7 @@ mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, prefetch_factor=6, - num_workers=8, + num_workers=5, random_state=random_state, learning_rate=1e-3, # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 3a52382f..1ffa2584 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=1:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=8 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:2 nvidia-smi From abd1929355725912fbf068f36f361dca7f5d5330 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 16:29:33 -0500 Subject: [PATCH 119/307] typo --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 1ffa2584..2a536a59 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -23,7 +23,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ --batch-size 128 \ - --gpus 1 \ + --gpus 2 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From 11791caa15e1f57ecf417139063a57b2a9adf664 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: 
Mon, 20 Nov 2023 17:30:16 -0500 Subject: [PATCH 120/307] 4 gpu --- only_for_me/narval/train.py | 6 +++--- only_for_me/narval/train.sh | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 423f1501..f5dfcf50 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -66,7 +66,7 @@ all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str - train_urls, val_urls = all_urls[:25], all_urls[25:] + train_urls, val_urls = all_urls[:70], all_urls[70:] schema = schemas.decals_all_campaigns_ortho_schema # debug mode @@ -75,9 +75,9 @@ 'Using debug mode: cutting urls down to 2') train_urls = train_urls[:2] val_urls = val_urls[:2] - epochs = 2 - else: epochs = 1 + else: + epochs = 1000 if args.wandb: wandb_logger = WandbLogger( diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 2a536a59..925fe1c3 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,18 +1,18 @@ #!/bin/bash #SBATCH --time=1:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=8 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 +#SBATCH --gres=gpu:v100:4 nvidia-smi PYTHON=/home/walml/envs/zoobot39_dev/bin/python # source ~/envs/zoobot39_dev/bin/activate -# mkdir $SLURM_TMPDIR/cache -mkdir /tmp/cache +mkdir $SLURM_TMPDIR/cache +# mkdir /tmp/cache export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. # export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. @@ -23,7 +23,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ --batch-size 128 \ - --gpus 2 \ + --gpus 4 \ --color --wandb --mixed-precision # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From 875b25c0078258857be56f9b796d3ff9b4372dc6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 17:33:48 -0500 Subject: [PATCH 121/307] typo --- only_for_me/narval/train.py | 4 ++-- only_for_me/narval/train.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index f5dfcf50..90529c5b 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -111,8 +111,8 @@ num_workers=5, random_state=random_state, learning_rate=1e-3, - # cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' - cache_dir='/tmp/cache' + cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + # cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 925fe1c3..abe09b12 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --time=1:00:0 +#SBATCH --time=4:00:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=8 From 0212fcd7e6ccb49d25c790bc3ab44c5f60f338b3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 22:00:35 -0500 Subject: [PATCH 122/307] hangs during val metrics. 
Try disable wandb --- only_for_me/narval/train.py | 3 ++- only_for_me/narval/train.sh | 4 +++- zoobot/pytorch/training/train_with_pytorch_lightning.py | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 90529c5b..61708055 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -66,7 +66,8 @@ all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str - train_urls, val_urls = all_urls[:70], all_urls[70:] + # train_urls, val_urls = all_urls[:70], all_urls[70:] + train_urls, val_urls = all_urls[:10], all_urls[10:14] schema = schemas.decals_all_campaigns_ortho_schema # debug mode diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index abe09b12..c4201f7b 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -24,7 +24,9 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models \ --batch-size 128 \ --gpus 4 \ - --color --wandb --mixed-precision + --color --mixed-precision + + # --wandb # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py # --architecture maxvit_small_tf_224 \ diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 8c6a973e..f15acb19 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -189,6 +189,9 @@ def train_default_zoobot_from_scratch( 'prefetch_factor': prefetch_factor, 'framework': 'pytorch' }) + else: + logging.warning('No wandb_logger passed. Disabling logging') + wandb_logger = False # work out what dataset the user has passed single_catalog = catalog is not None From a038550b300befac5fec8b535ccb159e5a70ec9f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 22:22:53 -0500 Subject: [PATCH 123/307] csv logger --- only_for_me/narval/train.sh | 2 +- zoobot/pytorch/training/train_with_pytorch_lightning.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c4201f7b..d91e0094 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,7 +21,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/debug_models \ + --save-dir $REPO_DIR/only_for_me/narval/debug_models_v2 \ --batch-size 128 \ --gpus 4 \ --color --mixed-precision diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index f15acb19..8e8ad050 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -7,6 +7,7 @@ from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.loggers import CSVLogger from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule @@ -190,8 +191,8 @@ def train_default_zoobot_from_scratch( 'framework': 'pytorch' }) else: - logging.warning('No wandb_logger passed. Disabling logging') - wandb_logger = False + logging.warning('No wandb_logger passed. 
Using CSV logging only') + wandb_logger = CSVLogger(save_dir=save_dir) # work out what dataset the user has passed single_catalog = catalog is not None From 3147b93eded8bdfcd024b787f9ca91c10466e2bc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 22:33:13 -0500 Subject: [PATCH 124/307] wandb again but rank 0 only logging --- only_for_me/narval/train.sh | 4 ++-- zoobot/pytorch/estimators/define_model.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index d91e0094..fbd93816 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -24,9 +24,9 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models_v2 \ --batch-size 128 \ --gpus 4 \ - --color --mixed-precision + --color --wandb --mixed-precision - # --wandb + # # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py # --architecture maxvit_small_tf_224 \ diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 3b9c238d..39926543 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -239,15 +239,15 @@ def configure_optimizers(self): def log_outputs(self, outputs, step_name): - self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, sync_dist=True) + self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True) # if self.log_on_step: # # seperate call to allow for different name, to allow for consistency with TF.keras auto-names # self.log( - # "{}/step_loss".format(step_name), outputs['loss'], on_epoch=False, on_step=True, prog_bar=True, logger=True, sync_dist=True) + # "{}/step_loss".format(step_name), outputs['loss'], on_epoch=False, on_step=True, prog_bar=True, logger=True, rank_zero_only=True) if outputs['predictions'].shape[1] == 2: # will only do for binary classifications # logging.info(predictions.shape, labels.shape) self.log( - "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, sync_dist=True) + "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True) def log_loss_per_question(self, multiq_loss, prefix): @@ -256,7 +256,7 @@ def log_loss_per_question(self, multiq_loss, prefix): # unlike Finetuneable..., does not use TorchMetrics, simply logs directly # TODO could use TorchMetrics and for q in schema, self.q_metric loop for question_n in range(multiq_loss.shape[1]): - self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) + self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True) From dee12072c48dd8694f912ede18c3e5844342d994 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 22:46:37 -0500 Subject: [PATCH 125/307] it's not wandb. 
Try disable all self.log --- zoobot/pytorch/estimators/define_model.py | 17 +++++++---------- .../training/train_with_pytorch_lightning.py | 14 +++++++------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 39926543..9d4fda9b 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -239,15 +239,11 @@ def configure_optimizers(self): def log_outputs(self, outputs, step_name): - self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True) - # if self.log_on_step: - # # seperate call to allow for different name, to allow for consistency with TF.keras auto-names + # self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True) + # if outputs['predictions'].shape[1] == 2: # will only do for binary classifications # self.log( - # "{}/step_loss".format(step_name), outputs['loss'], on_epoch=False, on_step=True, prog_bar=True, logger=True, rank_zero_only=True) - if outputs['predictions'].shape[1] == 2: # will only do for binary classifications - # logging.info(predictions.shape, labels.shape) - self.log( - "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True) + # "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True) + pass def log_loss_per_question(self, multiq_loss, prefix): @@ -255,8 +251,9 @@ def log_loss_per_question(self, multiq_loss, prefix): # TODO need schema attribute or similar to have access to question names, this will do for now # unlike Finetuneable..., does not use TorchMetrics, simply logs directly # TODO could use TorchMetrics and for q in schema, self.q_metric loop - for question_n in range(multiq_loss.shape[1]): - self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True) + # for question_n in range(multiq_loss.shape[1]): + # self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True) + pass diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 8e8ad050..33e724a2 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -275,9 +275,9 @@ def train_default_zoobot_from_scratch( save_top_k=save_top_k ) - early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True) - - callbacks = [checkpoint_callback, early_stopping_callback] + extra_callbacks + # early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True) + # , early_stopping_callback + callbacks = [checkpoint_callback] + extra_callbacks trainer = pl.Trainer( log_every_n_steps=150, # at batch 512 (A100 MP max), DR5 has ~161 train steps @@ -290,12 +290,12 @@ def train_default_zoobot_from_scratch( callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir, - plugins=plugins, - use_distributed_sampler=use_distributed_sampler + plugins=plugins + # 
use_distributed_sampler=use_distributed_sampler ) - logging.info((trainer.strategy, trainer.world_size, - trainer.local_rank, trainer.global_rank, trainer.node_rank)) + # logging.info((trainer.strategy, trainer.world_size, + # trainer.local_rank, trainer.global_rank, trainer.node_rank)) trainer.fit(lightning_model, datamodule) # uses batch size of datamodule From cb890a90ce169a2ba6ca18baf42326d550c5f8cd Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 22:53:38 -0500 Subject: [PATCH 126/307] disable checkpointing callback --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 33e724a2..5cb29cec 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -277,7 +277,8 @@ def train_default_zoobot_from_scratch( # early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True) # , early_stopping_callback - callbacks = [checkpoint_callback] + extra_callbacks + # callbacks = [checkpoint_callback] + extra_callbacks + callbacks = None trainer = pl.Trainer( log_every_n_steps=150, # at batch 512 (A100 MP max), DR5 has ~161 train steps From 90888a8ae8f31b3b2fb421c9d8b208f8dc8e9749 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 23:09:54 -0500 Subject: [PATCH 127/307] limit batches, num_workers=1 --- only_for_me/narval/train.py | 5 +++-- zoobot/pytorch/training/train_with_pytorch_lightning.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 61708055..40986e9d 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -32,6 +32,7 @@ default=256, type=int) parser.add_argument('--gpus', dest='gpus', default=1, type=int) parser.add_argument('--nodes', dest='nodes', default=1, type=int) + parser.add_argument('--num_workers', dest='num_workers', default=1, type=int) parser.add_argument('--mixed-precision', dest='mixed_precision', default=False, action='store_true') parser.add_argument('--debug', dest='debug', @@ -108,8 +109,8 @@ nodes=args.nodes, mixed_precision=args.mixed_precision, wandb_logger=wandb_logger, - prefetch_factor=6, - num_workers=5, + prefetch_factor=1, # TODO + num_workers=args.num_workers, random_state=random_state, learning_rate=1e-3, cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 5cb29cec..e41295f3 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -291,7 +291,9 @@ def train_default_zoobot_from_scratch( callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir, - plugins=plugins + plugins=plugins, + limit_train_batches=1, + limit_val_batches=1 # use_distributed_sampler=use_distributed_sampler ) From bd1d0198c92f8784530a10b591306efd436b881c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 23:16:12 -0500 Subject: [PATCH 128/307] works with limit_batches and num_workers=1. 
likely caused by unequal numbers of validation batches remove limit_batches, see if it breaks --- zoobot/pytorch/estimators/define_model.py | 16 ++++++++-------- .../training/train_with_pytorch_lightning.py | 8 +++++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 9d4fda9b..d065dbdf 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -239,11 +239,11 @@ def configure_optimizers(self): def log_outputs(self, outputs, step_name): - # self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True) - # if outputs['predictions'].shape[1] == 2: # will only do for binary classifications - # self.log( - # "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True) - pass + self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True) + if outputs['predictions'].shape[1] == 2: # will only do for binary classifications + self.log( + "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True) + # pass def log_loss_per_question(self, multiq_loss, prefix): @@ -251,9 +251,9 @@ def log_loss_per_question(self, multiq_loss, prefix): # TODO need schema attribute or similar to have access to question names, this will do for now # unlike Finetuneable..., does not use TorchMetrics, simply logs directly # TODO could use TorchMetrics and for q in schema, self.q_metric loop - # for question_n in range(multiq_loss.shape[1]): - # self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True) - pass + for question_n in range(multiq_loss.shape[1]): + self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True) + # pass diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index e41295f3..d5aa296c 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -275,6 +275,7 @@ def train_default_zoobot_from_scratch( save_top_k=save_top_k ) + # TODO # early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True) # , early_stopping_callback # callbacks = [checkpoint_callback] + extra_callbacks @@ -291,9 +292,10 @@ def train_default_zoobot_from_scratch( callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir, - plugins=plugins, - limit_train_batches=1, - limit_val_batches=1 + plugins=plugins + # , + # limit_train_batches=1, + # limit_val_batches=1 # use_distributed_sampler=use_distributed_sampler ) From 742f1cc7afb954763d71c32ed02ddb57301a815e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 23:22:32 -0500 Subject: [PATCH 129/307] broken with num_workers =1 and without train/val batch limit restore num_workers=5 and val_batch limit theory is it's the val batches being unequal --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 1 + 
zoobot/pytorch/training/train_with_pytorch_lightning.py | 5 ++--- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 40986e9d..60e2393a 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -32,7 +32,7 @@ default=256, type=int) parser.add_argument('--gpus', dest='gpus', default=1, type=int) parser.add_argument('--nodes', dest='nodes', default=1, type=int) - parser.add_argument('--num_workers', dest='num_workers', default=1, type=int) + parser.add_argument('--num-workers', dest='num_workers', default=1, type=int) parser.add_argument('--mixed-precision', dest='mixed_precision', default=False, action='store_true') parser.add_argument('--debug', dest='debug', diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index fbd93816..5dab3ca7 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -24,6 +24,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models_v2 \ --batch-size 128 \ --gpus 4 \ + --num-workers 5 \ --color --wandb --mixed-precision # diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index d5aa296c..c5a1b2ad 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -292,10 +292,9 @@ def train_default_zoobot_from_scratch( callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir, - plugins=plugins - # , + plugins=plugins, # limit_train_batches=1, - # limit_val_batches=1 + limit_val_batches=1 # use_distributed_sampler=use_distributed_sampler ) From 15ed1bac5802820a47632fd419b419f9533818b9 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 23:32:23 -0500 Subject: [PATCH 130/307] try with 2 gpu, 5 dataloader per, 10 chunks in train/val exactly equal split is possible --- only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 60e2393a..2e048da6 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -68,7 +68,7 @@ all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str # train_urls, val_urls = all_urls[:70], all_urls[70:] - train_urls, val_urls = all_urls[:10], all_urls[10:14] + train_urls, val_urls = all_urls[:10], all_urls[10:20] schema = schemas.decals_all_campaigns_ortho_schema # debug mode diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 5dab3ca7..6f7fdca4 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=4:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=8 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 +#SBATCH --gres=gpu:v100:2 nvidia-smi @@ -23,7 +23,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models_v2 \ --batch-size 128 \ - --gpus 4 \ + --gpus 2 \ --num-workers 5 \ --color --wandb --mixed-precision From b92935a9eb8b7a7971c292d8a0b98ca271ea33f9 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 23:50:13 -0500 Subject: [PATCH 131/307] it runs with precisely split dataloaders and limit_val=1 try without limit_val=1 and a larger (but still exactly equal) split --- 
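(A rough sketch of the "exactly equal split" arithmetic this run relies on, under the working theory above that DDP stalls when ranks see unequal numbers of validation batches. The helper below is hypothetical and not part of the patch: each of the gpus * num_workers dataloader workers should receive the same number of webdataset shards.)

    def assert_even_shard_split(n_shards: int, gpus: int, num_workers: int):
        # every dataloader worker on every rank gets the same number of shards
        workers_total = gpus * num_workers
        assert n_shards % workers_total == 0, (
            f'{n_shards} shards cannot be split evenly over {workers_total} workers')

    assert_even_shard_split(n_shards=60, gpus=2, num_workers=5)  # train: 6 shards per worker
    assert_even_shard_split(n_shards=10, gpus=2, num_workers=5)  # val: 1 shard per worker
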
only_for_me/narval/train.py | 2 +- only_for_me/narval/train.sh | 2 +- zoobot/pytorch/training/train_with_pytorch_lightning.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 2e048da6..2c59aec1 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -68,7 +68,7 @@ all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str # train_urls, val_urls = all_urls[:70], all_urls[70:] - train_urls, val_urls = all_urls[:10], all_urls[10:20] + train_urls, val_urls = all_urls[:60], all_urls[60:70] schema = schemas.decals_all_campaigns_ortho_schema # debug mode diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 6f7fdca4..39c9cdae 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -22,7 +22,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/debug_models_v2 \ - --batch-size 128 \ + --batch-size 256 \ --gpus 2 \ --num-workers 5 \ --color --wandb --mixed-precision diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index c5a1b2ad..d5aa296c 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -292,9 +292,10 @@ def train_default_zoobot_from_scratch( callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir, - plugins=plugins, + plugins=plugins + # , # limit_train_batches=1, - limit_val_batches=1 + # limit_val_batches=1 # use_distributed_sampler=use_distributed_sampler ) From a823db636baaf85923ea91007ebb05b7ea5148c8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 23:53:44 -0500 Subject: [PATCH 132/307] add callbacks back --- .../pytorch/training/train_with_pytorch_lightning.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index d5aa296c..a40e935b 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -275,14 +275,12 @@ def train_default_zoobot_from_scratch( save_top_k=save_top_k ) - # TODO - # early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True) - # , early_stopping_callback - # callbacks = [checkpoint_callback] + extra_callbacks - callbacks = None + early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True) + callbacks = [checkpoint_callback, early_stopping_callback] + extra_callbacks + # callbacks = None trainer = pl.Trainer( - log_every_n_steps=150, # at batch 512 (A100 MP max), DR5 has ~161 train steps + log_every_n_steps=150, accelerator=accelerator, devices=devices, # per node num_nodes=nodes, From 01bbb43f92f57eacdb5cfdaa5be0b0e4786630c6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 20 Nov 2023 23:58:40 -0500 Subject: [PATCH 133/307] restore sync_dist (since metrics were not the problem) experiment with torch.compile --- only_for_me/narval/train.py | 1 + zoobot/pytorch/estimators/define_model.py | 6 +++--- zoobot/pytorch/training/train_with_pytorch_lightning.py | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 2c59aec1..e78f666b 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -111,6 +111,7 @@ wandb_logger=wandb_logger, prefetch_factor=1, # TODO num_workers=args.num_workers, + compile_model=True, # NEW random_state=random_state, learning_rate=1e-3, cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index d065dbdf..6c385f7a 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -239,10 +239,10 @@ def configure_optimizers(self): def log_outputs(self, outputs, step_name): - self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, rank_zero_only=True) + self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, sync_dist=True) if outputs['predictions'].shape[1] == 2: # will only do for binary classifications self.log( - "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, rank_zero_only=True) + "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, sync_dist=True) # pass @@ -252,7 +252,7 @@ def log_loss_per_question(self, multiq_loss, prefix): # unlike Finetuneable..., does not use TorchMetrics, simply logs directly # TODO could use TorchMetrics and for q in schema, self.q_metric loop for question_n in range(multiq_loss.shape[1]): - self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, rank_zero_only=True) + self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) # pass diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index a40e935b..5d99a8a3 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -51,6 +51,7 @@ def train_default_zoobot_from_scratch( num_workers=4, prefetch_factor=4, mixed_precision=False, + compile_model=False, # checkpointing / logging wandb_logger=None, checkpoint_file_template=None, @@ -300,6 +301,10 @@ def train_default_zoobot_from_scratch( # logging.info((trainer.strategy, trainer.world_size, # trainer.local_rank, trainer.global_rank, trainer.node_rank)) + if compile_model: + logging.warning('Using torch.compile on LightningModel') + lightning_model = torch.compile(lightning_model) + trainer.fit(lightning_model, datamodule) # uses batch size of datamodule test_trainer = pl.Trainer( From fead45a571f18d972b4809b0d0d5cdea9e230698 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 00:33:00 -0500 Subject: [PATCH 134/307] training_step not compiling due to batch_idx arg changing try to remove batch_idx at least from my own funcs (but it's built into lightning) also increase cache massively --- only_for_me/narval/train.py | 3 +++ zoobot/pytorch/estimators/define_model.py | 6 +++--- zoobot/pytorch/training/train_with_pytorch_lightning.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index e78f666b..09ab997a 100644 --- 
a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -119,4 +119,7 @@ # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) + # https://discuss.pytorch.org/t/torch-dynamo-hit-config-cache-size-limit-64/183886 + # https://pytorch.org/docs/stable/torch.compiler_faq.html#why-is-compilation-slow + wandb.finish() \ No newline at end of file diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 6c385f7a..35b0561c 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -83,13 +83,13 @@ def configure_optimizers(self): raise NotImplementedError('Must be subclassed') def training_step(self, batch, batch_idx): - return self.make_step(batch, batch_idx, step_name='train') + return self.make_step(batch, step_name='train') def on_train_batch_end(self, outputs, *args): self.log_outputs(outputs, step_name='train') def validation_step(self, batch, batch_idx): - return self.make_step(batch, batch_idx, step_name='validation') + return self.make_step(batch, step_name='validation') def on_validation_batch_end(self, outputs, *args): self.log_outputs(outputs, step_name='validation') @@ -98,7 +98,7 @@ def log_outputs(self, outputs, step_name): raise NotImplementedError('Must be subclassed') def test_step(self, batch, batch_idx): - return self.make_step(batch, batch_idx, step_name='test') + return self.make_step(batch, step_name='test') def on_test_batch_end(self, outputs, *args): self.log_outputs(outputs, step_name='test') diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 5d99a8a3..a3bf582e 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -15,6 +15,7 @@ from zoobot.pytorch.datasets import webdatamodule + def train_default_zoobot_from_scratch( # absolutely crucial arguments save_dir: str, # save model here @@ -228,7 +229,6 @@ def train_default_zoobot_from_scratch( num_workers=num_workers, prefetch_factor=prefetch_factor ) - use_distributed_sampler=True else: # this branch will use WebDataModule to load premade webdatasets datamodule = webdatamodule.WebDataModule( @@ -240,7 +240,6 @@ def train_default_zoobot_from_scratch( cache_dir=cache_dir # TODO pass through the rest ) - use_distributed_sampler=False datamodule.setup(stage='fit') @@ -303,6 +302,7 @@ def train_default_zoobot_from_scratch( if compile_model: logging.warning('Using torch.compile on LightningModel') + torch._dynamo.config.cache_size_limit = 512 lightning_model = torch.compile(lightning_model) trainer.fit(lightning_model, datamodule) # uses batch size of datamodule From dbeced18a04c412081faf864a7a940cec47867b0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 00:40:25 -0500 Subject: [PATCH 135/307] 60 shards without compile --- only_for_me/narval/train.sh | 2 +- .../pytorch/training/train_with_pytorch_lightning.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 39c9cdae..74b770a0 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,7 +21,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/debug_models_v2 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_60_shards \ 
--batch-size 256 \ --gpus 2 \ --num-workers 5 \ diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index a3bf582e..7b927ce1 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -300,10 +300,12 @@ def train_default_zoobot_from_scratch( # logging.info((trainer.strategy, trainer.world_size, # trainer.local_rank, trainer.global_rank, trainer.node_rank)) - if compile_model: - logging.warning('Using torch.compile on LightningModel') - torch._dynamo.config.cache_size_limit = 512 - lightning_model = torch.compile(lightning_model) + # disabled for now until December, not crucial. Stop over-optimising. + # if compile_model: + # logging.warning('Using torch.compile on LightningModel') + # torch._dynamo.config.cache_size_limit = 512 + # torch._dynamo. + # lightning_model = torch.compile(lightning_model) trainer.fit(lightning_model, datamodule) # uses batch size of datamodule From 2769e360831504771954b15885165c9c76a94554 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 00:40:39 -0500 Subject: [PATCH 136/307] run longer --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 74b770a0..3c7d8a31 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --time=4:00:0 +#SBATCH --time=12:00:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=8 From 4ebe7fa505c94ccfeb03bd4fcd6439c5fd326f0f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 00:41:00 -0500 Subject: [PATCH 137/307] 23 hours --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 3c7d8a31..57bb147d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --time=12:00:0 +#SBATCH --time=23:00:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=8 From ccff0a189b422467323aa5befbf74d9c6bd40c21 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 09:38:52 -0500 Subject: [PATCH 138/307] try compile just the encoder --- zoobot/pytorch/estimators/define_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 35b0561c..babb52cb 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -70,7 +70,7 @@ def forward(self, x): x = self.encoder(x) return self.head(x) - def make_step(self, batch, batch_idx, step_name): + def make_step(self, batch, step_name): x, labels = batch predictions = self(x) # by default, these are Dirichlet concentrations loss = self.calculate_and_log_loss(predictions, labels, step_name) @@ -179,12 +179,12 @@ def __init__( self.weight_decay = weight_decay self.scheduler_params = scheduler_params - self.encoder = get_pytorch_encoder( + self.encoder = torch.compile(get_pytorch_encoder( architecture_name, channels, use_imagenet_weights=use_imagenet_weights, **timm_kwargs - ) + )) # bit lazy assuming 224 input size self.encoder_dim = get_encoder_dim(self.encoder, input_size=224, channels=channels) # typically encoder_dim=1280 for effnetb0 From 57dec267a67795393acd2338d58ed06aed6b400c Mon Sep 17 00:00:00 2001 From: Mike 
Walmsley Date: Tue, 21 Nov 2023 10:01:35 -0500 Subject: [PATCH 139/307] compile encoder, works well locally --- only_for_me/narval/gpu_split.py | 24 ++++++++++++------- only_for_me/narval/train.py | 2 +- zoobot/pytorch/estimators/define_model.py | 10 ++++++-- .../training/train_with_pytorch_lightning.py | 12 ++-------- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py index db2b5ea6..d73433e9 100644 --- a/only_for_me/narval/gpu_split.py +++ b/only_for_me/narval/gpu_split.py @@ -73,11 +73,11 @@ def main(): schema = schemas.decals_all_campaigns_ortho_schema - # shards = webdataset_utils.make_mock_wds(save_dir, schema.label_cols, n_shards=10, shard_size=32) + shards = webdataset_utils.make_mock_wds(save_dir, schema.label_cols, n_shards=10, shard_size=256) # exit() # webdataset_utils.load_wds_directly(shards[0], max_to_load=None) # webdataset_utils.load_wds_with_webdatamodule(shards, label_cols=schema.label_cols, max_to_load=None) - shards = list(glob.glob('/home/walml/repos/temp/mock_shard_*_32.tar')) + shards = list(glob.glob('/home/walml/repos/temp/mock_shard_*_256.tar')) # exit() train_shards = shards[:8] @@ -100,20 +100,28 @@ def main(): devices=args.gpus, # per node num_nodes=args.nodes, # strategy='auto', - precision='16-mixed', + precision='32', logger=False, # callbacks=callbacks, - max_epochs=1, + max_epochs=10, default_root_dir=save_dir, # plugins=plugins, # use_distributed_sampler=use_distributed_sampler ) - # logging.info((trainer.strategy, trainer.world_size, - # trainer.local_rank, trainer.global_rank, trainer.node_rank)) - - lightning_model = ToyLightningModule() + # lightning_model = ToyLightningModule() + # lightning_model = torch.compile(lightning_model) + from zoobot.pytorch.estimators import define_model + lightning_model = define_model.ZoobotTree( + output_dim=len(schema.label_cols), + question_index_groups=schema.question_index_groups, + architecture_name="efficientnet_b0", + channels=3, + compile_encoder=True + ) + # lightning_model = torch.compile(lightning_model) + trainer.fit(lightning_model, datamodule) # uses batch size of datamodule # batch size 16 diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 09ab997a..17c2b800 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -111,7 +111,7 @@ wandb_logger=wandb_logger, prefetch_factor=1, # TODO num_workers=args.num_workers, - compile_model=True, # NEW + compile_encoder=True, # NEW random_state=random_state, learning_rate=1e-3, cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index babb52cb..9f26ba89 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -144,6 +144,7 @@ def __init__( channels=1, use_imagenet_weights=False, test_time_dropout=True, + compile_encoder=False, timm_kwargs={}, # passed to timm.create_model e.g. 
drop_path_rate=0.2 for effnet # head args dropout_rate=0.2, @@ -162,6 +163,7 @@ def __init__( architecture_name, channels, timm_kwargs, + compile_encoder, test_time_dropout, dropout_rate, learning_rate, @@ -179,12 +181,16 @@ def __init__( self.weight_decay = weight_decay self.scheduler_params = scheduler_params - self.encoder = torch.compile(get_pytorch_encoder( + self.encoder = get_pytorch_encoder( architecture_name, channels, use_imagenet_weights=use_imagenet_weights, **timm_kwargs - )) + ) + if compile_encoder: + logging.warning('Using torch.compile on encoder') + self.encoder = torch.compile(self.encoder) + # bit lazy assuming 224 input size self.encoder_dim = get_encoder_dim(self.encoder, input_size=224, channels=channels) # typically encoder_dim=1280 for effnetb0 diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 7b927ce1..3868ebaa 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -52,7 +52,7 @@ def train_default_zoobot_from_scratch( num_workers=4, prefetch_factor=4, mixed_precision=False, - compile_model=False, + compile_encoder=False, # checkpointing / logging wandb_logger=None, checkpoint_file_template=None, @@ -254,6 +254,7 @@ def train_default_zoobot_from_scratch( dropout_rate=dropout_rate, learning_rate=learning_rate, timm_kwargs={'drop_path_rate': drop_connect_rate}, + compile_encoder=compile_encoder, betas=betas, weight_decay=weight_decay, scheduler_params=scheduler_params @@ -297,15 +298,6 @@ def train_default_zoobot_from_scratch( # use_distributed_sampler=use_distributed_sampler ) - # logging.info((trainer.strategy, trainer.world_size, - # trainer.local_rank, trainer.global_rank, trainer.node_rank)) - - # disabled for now until December, not crucial. Stop over-optimising. - # if compile_model: - # logging.warning('Using torch.compile on LightningModel') - # torch._dynamo.config.cache_size_limit = 512 - # torch._dynamo. - # lightning_model = torch.compile(lightning_model) trainer.fit(lightning_model, datamodule) # uses batch size of datamodule From f1cf5c629c8c6e68a84d2624475729b14344f746 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 10:04:08 -0500 Subject: [PATCH 140/307] add notes --- only_for_me/narval/gpu_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py index d73433e9..ff0ab328 100644 --- a/only_for_me/narval/gpu_split.py +++ b/only_for_me/narval/gpu_split.py @@ -118,7 +118,7 @@ def main(): question_index_groups=schema.question_index_groups, architecture_name="efficientnet_b0", channels=3, - compile_encoder=True + compile_encoder=False # with compile on local desktop, 51 seconds for first epoch and 10 seconds thereafter. Without, 12 seconds for all epochs. 
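        # Roughly speaking: compile_encoder=True wraps only the encoder, i.e.
        # self.encoder = torch.compile(self.encoder) inside ZoobotTree (see define_model.py in this patch).
        # Per patch 134, compiling the whole LightningModule kept recompiling because the batch_idx
        # argument to training_step changes every step, so compiling just the encoder sidesteps that,
        # at the cost of the first-epoch warmup timed in the comment above.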
) # lightning_model = torch.compile(lightning_model) From e15ed0e0dd282b5afa4d045dfa03a7978164f6cd Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 11:01:04 -0500 Subject: [PATCH 141/307] single v100 with 10 cpu/workers --- only_for_me/narval/train.sh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 57bb147d..a317b398 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=2 -#SBATCH --cpus-per-task=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 +#SBATCH --gres=gpu:v100:1 nvidia-smi @@ -23,11 +23,10 @@ REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_60_shards \ --batch-size 256 \ - --gpus 2 \ - --num-workers 5 \ + --gpus 1 \ + --num-workers 10 \ --color --wandb --mixed-precision - # # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py # --architecture maxvit_small_tf_224 \ From 399d5927e50229d3cb3b8397e5237f87c207600d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 21 Nov 2023 11:18:33 -0500 Subject: [PATCH 142/307] runs great on 1 gpu. Restart with save dir --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index a317b398..92fb913b 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,7 +21,7 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_60_shards \ + --save-dir $REPO_DIR/only_for_me/narval/desi_60_shards_1gpu \ --batch-size 256 \ --gpus 1 \ --num-workers 10 \ From 6d17c6e7d0661712d409c7befb341942e4e155bb Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 12:36:04 -0500 Subject: [PATCH 143/307] try train small-dim model, 2 gpu --- only_for_me/narval/make_webdataset_script.py | 2 +- only_for_me/narval/train.py | 8 ++++++-- only_for_me/narval/train.sh | 9 +++++---- zoobot/pytorch/training/train_with_pytorch_lightning.py | 7 ++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py index b1f5887f..8ff7a43f 100644 --- a/only_for_me/narval/make_webdataset_script.py +++ b/only_for_me/narval/make_webdataset_script.py @@ -89,7 +89,7 @@ def main(): train_catalog.to_parquet('/home/walml/data/wds/desi_labelled/train_catalog_v1.parquet', index=False) test_catalog.to_parquet('/home/walml/data/wds/desi_labelled/test_catalog_v1.parquet', index=False) - catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=4096) + catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=2048) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 17c2b800..415bf24f 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -30,6 +30,8 @@ parser.add_argument('--color', default=False, action='store_true') parser.add_argument('--batch-size', dest='batch_size', default=256, type=int) + parser.add_argument('--num-features', dest='num_features', + default=1280, type=int) parser.add_argument('--gpus', 
dest='gpus', default=1, type=int) parser.add_argument('--nodes', dest='nodes', default=1, type=int) parser.add_argument('--num-workers', dest='num_workers', default=1, type=int) @@ -63,12 +65,13 @@ search_str = '/home/walml/repos/zoobot/gz_decals_5_train_*.tar' else: - search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled/desi_labelled_train_*.tar' + search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_2048/desi_labelled_train_*.tar' all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str # train_urls, val_urls = all_urls[:70], all_urls[70:] - train_urls, val_urls = all_urls[:60], all_urls[60:70] + # train_urls, val_urls = all_urls[:60], all_urls[60:70] + train_urls, val_urls = all_urls[:120], all_urls[120:140] # all num shards must be divisible by workers * gpus e.g. 10*1, 10*2 schema = schemas.decals_all_campaigns_ortho_schema # debug mode @@ -97,6 +100,7 @@ val_urls = val_urls, test_urls = None, architecture_name=args.architecture_name, + timm_kwargs={'drop_path_rate': 0.2, 'num_features': args.num_features}, batch_size=args.batch_size, epochs=epochs, # rely on early stopping patience=10, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 92fb913b..08689882 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:00:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:2 nvidia-smi @@ -21,9 +21,10 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_60_shards_1gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_f128 \ --batch-size 256 \ - --gpus 1 \ + --num-features 128 \ + --gpus 2 \ --num-workers 10 \ --color --wandb --mixed-precision diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 3868ebaa..e46133c1 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -33,10 +33,10 @@ def train_default_zoobot_from_scratch( epochs=1000, patience=8, # model hparams - architecture_name='efficientnet_b0', # recently changed + architecture_name='efficientnet_b0', + timm_kwargs = {}, # e.g. {'drop_path_rate': 0.2, 'num_features': 1280}. Passed to timm model init method, depends on arch. 
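    # Illustrative sketch only: zoobot's get_pytorch_encoder forwards timm_kwargs to timm.create_model,
    # so timm_kwargs={'drop_path_rate': 0.2} amounts to timm.create_model('efficientnet_b0', drop_path_rate=0.2).
    # 'num_features' sets the width of the final embedding (1280 by default for efficientnet_b0); per the
    # commit message, passing it may need the cloned timm rather than the released package.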
batch_size=128, dropout_rate=0.2, - drop_connect_rate=0.2, learning_rate=1e-3, betas=(0.9, 0.999), weight_decay=0.01, @@ -253,7 +253,8 @@ def train_default_zoobot_from_scratch( test_time_dropout=True, dropout_rate=dropout_rate, learning_rate=learning_rate, - timm_kwargs={'drop_path_rate': drop_connect_rate}, + # https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/efficientnet.py#L75C9-L75C17 + timm_kwargs=timm_kwargs, compile_encoder=compile_encoder, betas=betas, weight_decay=weight_decay, From 032e52a591b90077b82bebd8d80d39198800b03e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 12:38:01 -0500 Subject: [PATCH 144/307] typo --- only_for_me/narval/make_webdataset_script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py index 8ff7a43f..67b37e72 100644 --- a/only_for_me/narval/make_webdataset_script.py +++ b/only_for_me/narval/make_webdataset_script.py @@ -75,10 +75,12 @@ def main(): df_dedup = remove_close_sky_matches(df) print(len(df_dedup)) + exit() # df_dedup2 = remove_close_sky_matches(df_dedup) # print(len(df_dedup2)) df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') + df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') # columns = ['id_str', 'smooth-or-featured-dr12_total-votes', 'smooth-or-featured-dr5_total-votes', 'smooth-or-featured-dr8_total-votes'] From e1426837be7374d5babcfb82d4de4474c4b3e000 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 13:54:24 -0500 Subject: [PATCH 145/307] pass aug args add timm kwargs option also need to clone timm (temp?) --- only_for_me/narval/make_webdataset_script.py | 20 +++++++----- only_for_me/narval/train.py | 14 ++++++--- only_for_me/narval/train.sh | 2 +- zoobot/pytorch/datasets/webdatamodule.py | 31 +++++++++++++++++-- .../training/train_with_pytorch_lightning.py | 11 +++++-- 5 files changed, 61 insertions(+), 17 deletions(-) diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py index 67b37e72..efd33e91 100644 --- a/only_for_me/narval/make_webdataset_script.py +++ b/only_for_me/narval/make_webdataset_script.py @@ -62,19 +62,23 @@ def main(): # desi pipeline shreds sources. Be careful to deduplicate. 
columns = ['id_str'] + label_cols - votes = pd.concat([ - pd.read_parquet(f'/media/walml/beta/galaxy_zoo/decals/dr8/catalogs/training_catalogs/{campaign}_ortho_v5_labelled_catalog.parquet', columns=columns) - for campaign in ['dr12', 'dr5', 'dr8'] - ], axis=0) - assert votes['id_str'].value_counts().max() == 1, votes['id_str'].value_counts() - votes['dr8_id'] = votes['id_str'] - df = pd.merge(df, votes[['dr8_id']], on='dr8_id', how='inner') + # votes = pd.concat([ + # pd.read_parquet(f'/media/walml/beta/galaxy_zoo/decals/dr8/catalogs/training_catalogs/{campaign}_ortho_v5_labelled_catalog.parquet', columns=columns) + # for campaign in ['dr12', 'dr5', 'dr8'] + # ], axis=0) + # assert votes['id_str'].value_counts().max() == 1, votes['id_str'].value_counts() + # votes['dr8_id'] = votes['id_str'] + + # name = 'labelled' + # merge_strategy = {'labelled': 'inner', 'all': 'left'} + # df = pd.merge(df, votes[['dr8_id']], on='dr8_id', how=merge_strategy[name]) df['relative_file_loc'] = df.apply(lambda x: f"{x['brickid']}/{x['brickid']}_{x['objid']}.jpg", axis=1) df['file_loc'] = '/home/walml/data/desi/jpg/' + df['relative_file_loc'] df_dedup = remove_close_sky_matches(df) print(len(df_dedup)) + df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') exit() # df_dedup2 = remove_close_sky_matches(df_dedup) # print(len(df_dedup2)) @@ -103,6 +107,7 @@ def remove_close_sky_matches(df, seplimit=20*u.arcsec, col_to_prioritise='ra'): search_coords = catalog + logging.info('Beginning search for nearby galaxies') idxc, idxcatalog, d2d, _ = catalog.search_around_sky(search_coords, seplimit=seplimit) # idxc is index in search coords # idxcatalog is index in catalog @@ -114,6 +119,7 @@ def remove_close_sky_matches(df, seplimit=20*u.arcsec, col_to_prioritise='ra'): idxcatalog = idxcatalog[d2d > 0] d2d = d2d[d2d > 0] + logging.info('Beginning drop prioritisation') indices_to_drop = [] for search_index_val in pd.unique(idxc): matched_indices = idxcatalog[idxc == search_index_val] diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 415bf24f..e9de4a0c 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -21,15 +21,16 @@ See zoobot/pytorch/examples/minimal_examples.py for a friendlier example """ parser = argparse.ArgumentParser() - parser.add_argument('--save-dir', dest='save_dir', type=str) + parser.add_argument('--save-dir', dest='save_dir', type=str, default='local_debug') # parser.add_argument('--data-dir', dest='data_dir', type=str) # parser.add_argument('--dataset', dest='dataset', type=str, help='dataset to use, either "gz_decals_dr5" or "gz_evo"') parser.add_argument('--architecture', dest='architecture_name', default='efficientnet_b0', type=str) parser.add_argument('--resize-after-crop', dest='resize_after_crop', type=int, default=224) parser.add_argument('--color', default=False, action='store_true') + parser.add_argument('--compile-encoder', dest='compile_encoder', default=False, action='store_true') parser.add_argument('--batch-size', dest='batch_size', - default=256, type=int) + default=16, type=int) parser.add_argument('--num-features', dest='num_features', default=1280, type=int) parser.add_argument('--gpus', dest='gpus', default=1, type=int) @@ -62,10 +63,13 @@ # logging.info([(x, y) for (x, y) in os.environ.items() if 'SLURM' in x]) if os.path.isdir('/home/walml/repos/zoobot'): - search_str = '/home/walml/repos/zoobot/gz_decals_5_train_*.tar' + logging.warning('local mode') + search_str = 
'/home/walml/data/wds/desi_labelled_2048/desi_labelled_train_*.tar' + cache_dir = None else: search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_2048/desi_labelled_train_*.tar' + cache_dir = os.environ['SLURM_TMPDIR'] + '/cache' all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str @@ -115,10 +119,10 @@ wandb_logger=wandb_logger, prefetch_factor=1, # TODO num_workers=args.num_workers, - compile_encoder=True, # NEW + compile_encoder=args.compile_encoder, # NEW random_state=random_state, learning_rate=1e-3, - cache_dir=os.environ['SLURM_TMPDIR'] + '/cache' + cache_dir=cache_dir # cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 08689882..e4e217d6 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -26,7 +26,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --num-features 128 \ --gpus 2 \ --num-workers 10 \ - --color --wandb --mixed-precision + --color --wandb --mixed-precision --compile-encoder # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index bec1c7f4..8eec8071 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -12,7 +12,23 @@ # https://github.com/webdataset/webdataset-lightning/blob/main/train.py class WebDataModule(pl.LightningDataModule): - def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_cols=None, batch_size=64, num_workers=4, prefetch_factor=4, cache_dir=None): + def __init__( + self, + train_urls, + val_urls, + label_cols=None, + train_size=None, + val_size=None, + # hardware + batch_size=64, + num_workers=4, + prefetch_factor=4, + cache_dir=None, + color=False, + crop_scale_bounds=(0.7, 0.8), + crop_ratio_bounds=(0.9, 1.1), + resize_after_crop=224 + ): super().__init__() # if isinstance(train_urls, types.GeneratorType): @@ -39,6 +55,12 @@ def __init__(self, train_urls, val_urls, train_size=None, val_size=None, label_c self.cache_dir = cache_dir + # could use mixin + self.color = color + self.resize_after_crop = resize_after_crop + self.crop_scale_bounds = crop_scale_bounds + self.crop_ratio_bounds = crop_ratio_bounds + logging.info(f'Creating webdatamodule with WORLD_SIZE: {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}') @@ -52,7 +74,12 @@ def make_image_transform(self, mode="train"): # if mode == "train": # elif mode == "val": - augmentation_transform = default_transforms() # A.Compose object + augmentation_transform = default_transforms( + crop_scale_bounds=self.crop_scale_bounds, + crop_ratio_bounds=self.crop_ratio_bounds, + resize_after_crop=self.resize_after_crop, + pytorch_greyscale=not self.color + ) # A.Compose object def do_transform(img): return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) return do_transform diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index e46133c1..3f17fd65 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -234,10 +234,17 @@ def train_default_zoobot_from_scratch( datamodule = webdatamodule.WebDataModule( train_urls=train_urls, val_urls=val_urls, + label_cols=schema.label_cols, + # hardware batch_size=batch_size, 
num_workers=num_workers, - label_cols=schema.label_cols, - cache_dir=cache_dir + prefetch_factor=prefetch_factor, + cache_dir=cache_dir, + # augmentation args + color=color, + crop_scale_bounds=crop_scale_bounds, + crop_ratio_bounds=crop_ratio_bounds, + resize_after_crop=resize_after_crop, # TODO pass through the rest ) From 487f4661e0f42075c7d2af194873675458cd74c6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 14:01:19 -0500 Subject: [PATCH 146/307] typo --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index e4e217d6..cff5b813 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -23,7 +23,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_f128 \ --batch-size 256 \ - --num-features 128 \ + --num-features 128 \ --gpus 2 \ --num-workers 10 \ --color --wandb --mixed-precision --compile-encoder From 3303a9d7554963e4c5c5a8a626db4a97406fcfe8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 14:02:18 -0500 Subject: [PATCH 147/307] test priority --- only_for_me/narval/train.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index cff5b813..dda8efba 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash -#SBATCH --time=23:00:0 +#SBATCH --time=0:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 +#SBATCH --gres=gpu:v100:1 nvidia-smi From e9d34320e06f0b3598d08e03f997c2117a5544f1 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 14:02:28 -0500 Subject: [PATCH 148/307] v2 --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index dda8efba..225191ec 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -24,7 +24,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_f128 \ --batch-size 256 \ --num-features 128 \ - --gpus 2 \ + --gpus 1 \ --num-workers 10 \ --color --wandb --mixed-precision --compile-encoder From 3e27ae25967fe4b1941528b8efc1973b4c019f32 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 14:08:57 -0500 Subject: [PATCH 149/307] it works, queue for 23h full run --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 225191ec..a29f2b8f 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --time=0:30:0 +#SBATCH --time=23:30:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 From 52718f7b47571eaafcf4f16ac78c1989fb98e868 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 14:10:00 -0500 Subject: [PATCH 150/307] try 512 features --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index a29f2b8f..80baefa0 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -21,9 +21,9 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t 
REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_f128 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_f512 \ --batch-size 256 \ - --num-features 128 \ + --num-features 512 \ --gpus 1 \ --num-workers 10 \ --color --wandb --mixed-precision --compile-encoder From c251988c0133433b5f06eef26fd4de5212794575 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 14:16:26 -0500 Subject: [PATCH 151/307] restart 2gpu --- only_for_me/narval/train.sh | 10 +++++----- .../pytorch/training/train_with_pytorch_lightning.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 80baefa0..41e5ded0 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=2 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:2 nvidia-smi @@ -21,10 +21,10 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_f512 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_f128_2gpu \ --batch-size 256 \ - --num-features 512 \ - --gpus 1 \ + --num-features 128 \ + --gpus 2 \ --num-workers 10 \ --color --wandb --mixed-precision --compile-encoder diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 3f17fd65..42e63bd0 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -115,6 +115,7 @@ def train_default_zoobot_from_scratch( assert save_dir is not None if not os.path.isdir(save_dir): os.mkdir(save_dir) + logging.info(f'Saving to {save_dir}') if color: logging.warning( From 9df82670d91dd47dbc3001ca743f32d4b18cc80c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 17:04:13 -0500 Subject: [PATCH 152/307] minimal transforms only speed test --- only_for_me/narval/make_webdataset_script.py | 36 ++++++++++---------- only_for_me/narval/train.sh | 6 ++-- zoobot/pytorch/datasets/webdatamodule.py | 16 ++++++--- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py index efd33e91..9cb9e824 100644 --- a/only_for_me/narval/make_webdataset_script.py +++ b/only_for_me/narval/make_webdataset_script.py @@ -58,42 +58,42 @@ def main(): columns = [ 'dr8_id', 'brickid', 'objid', 'ra', 'dec' ] - df = pd.read_parquet('/home/walml/repos/decals-rings/data/master_all_file_index_passes_file_checks.parquet', columns=columns) + # df = pd.read_parquet('/home/walml/repos/decals-rings/data/master_all_file_index_passes_file_checks.parquet', columns=columns) # desi pipeline shreds sources. Be careful to deduplicate. 
columns = ['id_str'] + label_cols - # votes = pd.concat([ - # pd.read_parquet(f'/media/walml/beta/galaxy_zoo/decals/dr8/catalogs/training_catalogs/{campaign}_ortho_v5_labelled_catalog.parquet', columns=columns) - # for campaign in ['dr12', 'dr5', 'dr8'] - # ], axis=0) - # assert votes['id_str'].value_counts().max() == 1, votes['id_str'].value_counts() - # votes['dr8_id'] = votes['id_str'] + votes = pd.concat([ + pd.read_parquet(f'/media/walml/beta/galaxy_zoo/decals/dr8/catalogs/training_catalogs/{campaign}_ortho_v5_labelled_catalog.parquet', columns=columns) + for campaign in ['dr12', 'dr5', 'dr8'] + ], axis=0) + assert votes['id_str'].value_counts().max() == 1, votes['id_str'].value_counts() + votes['dr8_id'] = votes['id_str'] # name = 'labelled' # merge_strategy = {'labelled': 'inner', 'all': 'left'} # df = pd.merge(df, votes[['dr8_id']], on='dr8_id', how=merge_strategy[name]) - df['relative_file_loc'] = df.apply(lambda x: f"{x['brickid']}/{x['brickid']}_{x['objid']}.jpg", axis=1) - df['file_loc'] = '/home/walml/data/desi/jpg/' + df['relative_file_loc'] + # df['relative_file_loc'] = df.apply(lambda x: f"{x['brickid']}/{x['brickid']}_{x['objid']}.jpg", axis=1) + # df['file_loc'] = '/home/walml/data/desi/jpg/' + df['relative_file_loc'] - df_dedup = remove_close_sky_matches(df) - print(len(df_dedup)) - df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') - exit() + # df_dedup = remove_close_sky_matches(df) + # print(len(df_dedup)) + # df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') + # exit() # df_dedup2 = remove_close_sky_matches(df_dedup) # print(len(df_dedup2)) - df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') + # df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') - df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') + df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') # columns = ['id_str', 'smooth-or-featured-dr12_total-votes', 'smooth-or-featured-dr5_total-votes', 'smooth-or-featured-dr8_total-votes'] - df_dedup_with_votes = pd.merge(df_dedup, votes, how='inner', on='dr8_id') + df_dedup_with_votes = pd.merge(df_dedup, votes, how='left', on='dr8_id') train_catalog, test_catalog = train_test_split(df_dedup_with_votes, test_size=0.2, random_state=42) - train_catalog.to_parquet('/home/walml/data/wds/desi_labelled/train_catalog_v1.parquet', index=False) - test_catalog.to_parquet('/home/walml/data/wds/desi_labelled/test_catalog_v1.parquet', index=False) + train_catalog.to_parquet('/home/walml/data/wds/desi_all/train_catalog_v1.parquet', index=False) + test_catalog.to_parquet('/home/walml/data/wds/desi_all/test_catalog_v1.parquet', index=False) catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=2048) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 41e5ded0..00b6f648 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:2 +#SBATCH --gres=gpu:v100:1 nvidia-smi @@ -24,7 +24,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_f128_2gpu \ --batch-size 256 \ --num-features 128 
\ - --gpus 2 \ + --gpus 1 \ --num-workers 10 \ --color --wandb --mixed-precision --compile-encoder diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 8eec8071..18ad2296 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -8,7 +8,7 @@ import webdataset as wds -from galaxy_datasets.transforms import default_transforms +from galaxy_datasets import transforms # https://github.com/webdataset/webdataset-lightning/blob/main/train.py class WebDataModule(pl.LightningDataModule): @@ -74,12 +74,20 @@ def make_image_transform(self, mode="train"): # if mode == "train": # elif mode == "val": - augmentation_transform = default_transforms( - crop_scale_bounds=self.crop_scale_bounds, - crop_ratio_bounds=self.crop_ratio_bounds, + # augmentation_transform = transforms.default_transforms( + # crop_scale_bounds=self.crop_scale_bounds, + # crop_ratio_bounds=self.crop_ratio_bounds, + # resize_after_crop=self.resize_after_crop, + # pytorch_greyscale=not self.color + # ) # A.Compose object + + logging.warning('Minimal augmentations for speed test') + augmentation_transform = transforms.minimal_transforms( resize_after_crop=self.resize_after_crop, pytorch_greyscale=not self.color ) # A.Compose object + + def do_transform(img): return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) return do_transform From b8d3fc01432bb2c61a30ca97b826f0e3555e3db5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 22 Nov 2023 17:28:49 -0500 Subject: [PATCH 153/307] it is def CPU limited. Add even more CPUs --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 00b6f648..f1ee9ecc 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,7 +2,7 @@ #SBATCH --time=23:30:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=10 +#SBATCH --cpus-per-task=20 #SBATCH --mem-per-cpu 4G #SBATCH --gres=gpu:v100:1 @@ -25,7 +25,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --batch-size 256 \ --num-features 128 \ --gpus 1 \ - --num-workers 10 \ + --num-workers 20 \ --color --wandb --mixed-precision --compile-encoder # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py From 1b169ab44f35a7d78b63d342a1961970e26c3f3c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 11:06:42 -0500 Subject: [PATCH 154/307] set up for 300px training runs --- only_for_me/narval/make_webdataset_script.py | 23 ++++---- only_for_me/narval/train.py | 7 ++- only_for_me/narval/train.sh | 6 +- zoobot/pytorch/datasets/webdatamodule.py | 18 +++--- zoobot/pytorch/datasets/webdataset_utils.py | 61 ++++++++++++++++++-- 5 files changed, 86 insertions(+), 29 deletions(-) diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py index 9cb9e824..2113136a 100644 --- a/only_for_me/narval/make_webdataset_script.py +++ b/only_for_me/narval/make_webdataset_script.py @@ -23,7 +23,7 @@ def dataset_to_webdataset(dataset_name, dataset_func, label_cols, divisor=4096): catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=divisor) -def catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=4096): +def catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=4096): for (catalog_name, 
catalog) in [('train', train_catalog), ('test', test_catalog)]: n_shards = len(catalog) // divisor logging.info(n_shards) @@ -33,7 +33,7 @@ def catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog save_loc = f"/home/walml/data/wds/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically - webdataset_utils.df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards) + webdataset_utils.df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df) # webdataset_utils.load_wds_directly(save_loc) @@ -53,7 +53,8 @@ def main(): # for converting other catalogs e.g. DESI - dataset_name = 'desi_labelled' + dataset_name = 'desi_labelled_300px_2048' + # dataset_name = 'desi_all_2048' label_cols = label_metadata.decals_all_campaigns_ortho_label_cols columns = [ 'dr8_id', 'brickid', 'objid', 'ra', 'dec' @@ -84,18 +85,20 @@ def main(): # print(len(df_dedup2)) # df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') - - df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') + df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') + # df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') + df_dedup['id_str'] = df_dedup['dr8_id'] # columns = ['id_str', 'smooth-or-featured-dr12_total-votes', 'smooth-or-featured-dr5_total-votes', 'smooth-or-featured-dr8_total-votes'] - df_dedup_with_votes = pd.merge(df_dedup, votes, how='left', on='dr8_id') + # gets too big, need to only merge in label_df per shard + # df_dedup_with_votes = pd.merge(df_dedup, votes, how='left', on='dr8_id') - train_catalog, test_catalog = train_test_split(df_dedup_with_votes, test_size=0.2, random_state=42) - train_catalog.to_parquet('/home/walml/data/wds/desi_all/train_catalog_v1.parquet', index=False) - test_catalog.to_parquet('/home/walml/data/wds/desi_all/test_catalog_v1.parquet', index=False) + train_catalog, test_catalog = train_test_split(df_dedup, test_size=0.2, random_state=42) + train_catalog.to_parquet('/home/walml/data/wds/desi_labelled_300px_2048/train_catalog_v1.parquet', index=False) + test_catalog.to_parquet('/home/walml/data/wds/desi_labelled_300px_2048/test_catalog_v1.parquet', index=False) - catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=2048) + catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=2048, sparse_label_df=votes) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index e9de4a0c..5f74f156 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -64,11 +64,11 @@ if os.path.isdir('/home/walml/repos/zoobot'): logging.warning('local mode') - search_str = '/home/walml/data/wds/desi_labelled_2048/desi_labelled_train_*.tar' + search_str = '/home/walml/data/wds/desi_labelled_300px_2048/desi_labelled_train_*.tar' cache_dir = None else: - search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_2048/desi_labelled_train_*.tar' + search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_300px_2048/desi_labelled_train_*.tar' cache_dir = os.environ['SLURM_TMPDIR'] + '/cache' all_urls = glob.glob(search_str) @@ -122,7 +122,8 @@ compile_encoder=args.compile_encoder, # NEW random_state=random_state, learning_rate=1e-3, - cache_dir=cache_dir + cache_dir=cache_dir, + crop_scale_bounds=(0.75, 0.85) # slightly 
increased to compensate for 424-400px crop when saving webdataset # cache_dir='/tmp/cache' # /tmp for ramdisk (400GB total, vs 4TB total for nvme) ) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index f1ee9ecc..ec66aab8 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -2,7 +2,7 @@ #SBATCH --time=23:30:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=20 +#SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G #SBATCH --gres=gpu:v100:1 @@ -21,11 +21,11 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_f128_2gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_f128_1gpu \ --batch-size 256 \ --num-features 128 \ --gpus 1 \ - --num-workers 20 \ + --num-workers 10 \ --color --wandb --mixed-precision --compile-encoder # srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 18ad2296..f8fc1451 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -74,19 +74,19 @@ def make_image_transform(self, mode="train"): # if mode == "train": # elif mode == "val": - # augmentation_transform = transforms.default_transforms( - # crop_scale_bounds=self.crop_scale_bounds, - # crop_ratio_bounds=self.crop_ratio_bounds, - # resize_after_crop=self.resize_after_crop, - # pytorch_greyscale=not self.color - # ) # A.Compose object - - logging.warning('Minimal augmentations for speed test') - augmentation_transform = transforms.minimal_transforms( + augmentation_transform = transforms.default_transforms( + crop_scale_bounds=self.crop_scale_bounds, + crop_ratio_bounds=self.crop_ratio_bounds, resize_after_crop=self.resize_after_crop, pytorch_greyscale=not self.color ) # A.Compose object + # logging.warning('Minimal augmentations for speed test') + # augmentation_transform = transforms.fast_transforms( + # resize_after_crop=self.resize_after_crop, + # pytorch_greyscale=not self.color + # ) # A.Compose object + def do_transform(img): return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index 7ee46bbf..0abff0a2 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -6,6 +6,9 @@ from itertools import islice import glob + +import albumentations as A + import tqdm import numpy as np import pandas as pd @@ -37,26 +40,76 @@ def make_mock_wds(save_dir: str, label_cols: List, n_shards: int, shard_size: in -def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int): +def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse_label_df=None): + assert '.tar' in save_loc df['id_str'] = df['id_str'].astype(str).str.replace('.', '_') - + if sparse_label_df is not None: + logging.info(f'Using sparse label df: {len(sparse_label_df)}') shard_dfs = np.array_split(df, n_shards) logging.info(f'shards: {len(shard_dfs)}. 
Shard size: {len(shard_dfs[0])}') + + transforms_to_apply = [ + # below, for 224px fast training fast augs setup + # A.Resize( + # height=350, # now more aggressive, 65% crop effectively + # width=350, # now more aggressive, 65% crop effectively + # interpolation=cv2.INTER_AREA # slow and good interpolation + # ), + # A.CenterCrop( + # height=224, + # width=224, + # always_apply=True + # ), + # below, for standard training default augs + # small boundary trim and then resize expecting further 224px crop + # we want 0.7-0.8 effective crop + # in augs that could be 0.x-1.0, and here a pre-crop to 0.8 i.e. 340px + # but this would change the centering + # let's stick to small boundary crop and 0.75-0.85 in augs + A.CenterCrop( + height=400, + width=400, + always_apply=True + ), + A.Resize( + height=300, + width=300, + interpolation=cv2.INTER_AREA # slow and good interpolation + ) + ] + transform = A.Compose(transforms_to_apply) + # transform = None + for shard_n, shard_df in tqdm.tqdm(enumerate(shard_dfs), total=len(shard_dfs)): + if sparse_label_df is not None: + shard_df = pd.merge(shard_df, sparse_label_df, how='left', validate='one_to_one', suffixes=('', '_badlabelmerge')) # auto-merge shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar') logging.info(shard_save_loc) sink = wds.TarWriter(shard_save_loc) for _, galaxy in shard_df.iterrows(): - sink.write(galaxy_to_wds(galaxy, label_cols)) + sink.write(galaxy_to_wds(galaxy, label_cols, transform=transform)) sink.close() -def galaxy_to_wds(galaxy: pd.Series, label_cols): +def galaxy_to_wds(galaxy: pd.Series, label_cols, transform=None): im = cv2.imread(galaxy['file_loc']) # cv2 loads BGR for 'history', fix im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + # if central_crop is not None: + # width, height, _ = im.shape + # # assert width == height, (width, height) + # mid = int(width/2) + # half_central_crop = int(central_crop/2) + # low_edge, high_edge = mid - half_central_crop, mid + half_central_crop + # im = im[low_edge:high_edge, low_edge:high_edge] + # assert im.shape == (central_crop, central_crop, 3) + + # apply albumentations + if transform is not None: + im = transform(image=im)['image'] + labels = json.dumps(galaxy[label_cols].to_dict()) id_str = str(galaxy['id_str']) return { From 7d81c925b2945810e053c8a73afa2175fc84c891 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 13:24:55 -0500 Subject: [PATCH 155/307] typo --- only_for_me/narval/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 5f74f156..92dffb48 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -68,7 +68,7 @@ cache_dir = None else: - search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_300px_2048/desi_labelled_train_*.tar' + search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_300px_2048/desi_labelled_300px_2048_train_*.tar' cache_dir = os.environ['SLURM_TMPDIR'] + '/cache' all_urls = glob.glob(search_str) From 751b5d6bf2cace0bf26f823670e8713275b9ad46 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 15:40:53 -0500 Subject: [PATCH 156/307] prediction adjustments --- only_for_me/narval/make_webdataset_script.py | 14 +- only_for_me/narval/train.py | 2 +- zoobot/pytorch/datasets/webdatamodule.py | 132 +++++++----------- zoobot/pytorch/datasets/webdataset_utils.py | 13 +- zoobot/pytorch/estimators/define_model.py | 5 + zoobot/pytorch/training/finetune.py | 
16 ++- .../training/train_with_pytorch_lightning.py | 4 +- 7 files changed, 87 insertions(+), 99 deletions(-) diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py index 2113136a..b9c4be00 100644 --- a/only_for_me/narval/make_webdataset_script.py +++ b/only_for_me/narval/make_webdataset_script.py @@ -33,7 +33,7 @@ def catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog save_loc = f"/home/walml/data/wds/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically - webdataset_utils.df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df) + webdataset_utils.df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=False) # webdataset_utils.load_wds_directly(save_loc) @@ -53,8 +53,8 @@ def main(): # for converting other catalogs e.g. DESI - dataset_name = 'desi_labelled_300px_2048' - # dataset_name = 'desi_all_2048' + # dataset_name = 'desi_labelled_300px_2048' + dataset_name = 'desi_all_300px_2048' label_cols = label_metadata.decals_all_campaigns_ortho_label_cols columns = [ 'dr8_id', 'brickid', 'objid', 'ra', 'dec' @@ -85,8 +85,8 @@ def main(): # print(len(df_dedup2)) # df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') - df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') - # df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') + # df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') + df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') df_dedup['id_str'] = df_dedup['dr8_id'] # columns = ['id_str', 'smooth-or-featured-dr12_total-votes', 'smooth-or-featured-dr5_total-votes', 'smooth-or-featured-dr8_total-votes'] @@ -95,8 +95,8 @@ def main(): # df_dedup_with_votes = pd.merge(df_dedup, votes, how='left', on='dr8_id') train_catalog, test_catalog = train_test_split(df_dedup, test_size=0.2, random_state=42) - train_catalog.to_parquet('/home/walml/data/wds/desi_labelled_300px_2048/train_catalog_v1.parquet', index=False) - test_catalog.to_parquet('/home/walml/data/wds/desi_labelled_300px_2048/test_catalog_v1.parquet', index=False) + train_catalog.to_parquet(f'/home/walml/data/wds/{dataset_name}/train_catalog_v1.parquet', index=False) + test_catalog.to_parquet(f'/home/walml/data/wds/{dataset_name}/test_catalog_v1.parquet', index=False) catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=2048, sparse_label_df=votes) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 92dffb48..486763a5 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -64,7 +64,7 @@ if os.path.isdir('/home/walml/repos/zoobot'): logging.warning('local mode') - search_str = '/home/walml/data/wds/desi_labelled_300px_2048/desi_labelled_train_*.tar' + search_str = '/home/walml/data/wds/desi_labelled_300px_2048/desi_labelled_300px_2048_train_*.tar' cache_dir = None else: diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index f8fc1451..cf40e629 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -14,11 +14,11 @@ class WebDataModule(pl.LightningDataModule): def __init__( self, - train_urls, - val_urls, + train_urls=None, + val_urls=None, 
+ test_urls=None, + predict_urls=None, label_cols=None, - train_size=None, - val_size=None, # hardware batch_size=64, num_workers=4, @@ -31,21 +31,20 @@ def __init__( ): super().__init__() - # if isinstance(train_urls, types.GeneratorType): - # train_urls = list(train_urls) - # if isinstance(val_urls, types.GeneratorType): - # val_urls = list(val_urls) self.train_urls = train_urls self.val_urls = val_urls + self.test_urls = test_urls + self.predict_urls = predict_urls - if train_size is None: + if train_urls is not None: # assume the size of each shard is encoded in the filename as ..._{size}.tar - train_size = sum([int(url.rstrip('.tar').split('_')[-1]) for url in train_urls]) - if val_size is None: - val_size = sum([int(url.rstrip('.tar').split('_')[-1]) for url in val_urls]) - - self.train_size = train_size - self.val_size = val_size + self.train_size = interpret_dataset_size_from_urls(train_urls) + if val_urls is not None: + self.val_size = interpret_dataset_size_from_urls(val_urls) + if test_urls is not None: + self.test_size = interpret_dataset_size_from_urls(test_urls) + if predict_urls is not None: + self.predict_size = interpret_dataset_size_from_urls(predict_urls) self.label_cols = label_cols @@ -61,18 +60,14 @@ def __init__( self.crop_scale_bounds = crop_scale_bounds self.crop_ratio_bounds = crop_ratio_bounds + for url_name in ['train', 'val', 'test', 'predict']: + urls = getattr(self, f'{url_name}_urls') + if urls is not None: + logging.info(f"{url_name} (before hardware splits) = {len(urls)} e.g. {urls[0]}", ) - logging.info(f'Creating webdatamodule with WORLD_SIZE: {os.environ.get("WORLD_SIZE")}, RANK: {os.environ.get("RANK")}') - - logging.info(f"train_urls (before hardware splits) = {len(self.train_urls)} e.g. {self.train_urls[0]}", ) - logging.info(f"val_urls (before hardware splits) = {len(self.val_urls)} e.g. {self.val_urls[0]}", ) - # logging.info("train_size (before hardware splits) = ", self.train_size) - # logging.info("val_size (before hardware splits) = ", self.val_size) logging.info(f"batch_size: {self.batch_size}, num_workers: {self.num_workers}") def make_image_transform(self, mode="train"): - # if mode == "train": - # elif mode == "val": augmentation_transform = transforms.default_transforms( crop_scale_bounds=self.crop_scale_bounds, @@ -102,11 +97,11 @@ def label_transform(label_dict): def make_loader(self, urls, mode="train"): + dataset_size = getattr(self, f'{mode}_size') if mode == "train": - dataset_size = self.train_size - shuffle = min(self.train_size, 5000) - elif mode == "val": - dataset_size = self.val_size + shuffle = min(dataset_size, 5000) + else: + assert mode in ['val', 'test', 'predict'], mode shuffle = 0 transform_image = self.make_image_transform(mode=mode) @@ -120,21 +115,20 @@ def make_loader(self, urls, mode="train"): ) .shuffle(shuffle) .decode("rgb") - .to_tuple('image.jpg', 'labels.json') - .map_tuple(transform_image, transform_label) - # torch collate stacks dicts nicely while webdataset only lists them - # so use the torch collate instead - .batched(self.batch_size, torch.utils.data.default_collate, partial=False) - # .repeat(5) ) + if mode == 'predict': + # dataset = dataset.extract_keys('image.jpg').map(transform_image) + dataset = dataset.to_tuple('image.jpg').map_tuple(transform_image) # (im,) tuple. 
But map applied to all elements
+            # .map(get_first)
+        else:
+            dataset = (
+                dataset.to_tuple('image.jpg', 'labels.json')
+                .map_tuple(transform_image, transform_label)
+            )
-        # from itertools import islice
-        # for batch in islice(dataset, 0, 3):
-        #     images, labels = batch
-        #     # print(len(sample))
-        #     print(images.shape)
-        #     print(len(labels))  # list of dicts
-        #     # exit()
+        # torch collate stacks dicts nicely while webdataset only lists them
+        # so use the torch collate instead
+        dataset = dataset.batched(self.batch_size, torch.utils.data.default_collate, partial=False)
         loader = wds.WebLoader(
             dataset,
@@ -145,17 +139,13 @@ def make_loader(self, urls, mode="train"):
             prefetch_factor=self.prefetch_factor
         )
-        # print('sampling')
-        # for sample in islice(loader, 0, 3):
-        #     images, labels = sample
-        #     print(images.shape)
-        #     print(len(labels))  # list of dicts
-        #     exit()
-
         loader.length = dataset_size // self.batch_size # temp hack instead
-        assert dataset_size % self.batch_size == 0, (dataset_size, self.batch_size, dataset_size % self.batch_size)
+        if mode in ['train', 'val']:
+            assert dataset_size % self.batch_size == 0, (dataset_size, self.batch_size, dataset_size % self.batch_size)
+        # for test/predict, always single GPU anyway
+
         # if mode == "train":
         #     # ensure same number of batches in all clients
         #     loader = loader.ddp_equalize(dataset_size // self.batch_size)
@@ -168,32 +158,14 @@ def train_dataloader(self):
     def val_dataloader(self):
         return self.make_loader(self.val_urls, mode="val")
-
-    # @staticmethod
-    # def add_loader_specific_args(parser):
-    #     parser.add_argument("-b", "--batch-size", type=int, default=128)
-    #     parser.add_argument("--workers", type=int, default=6)
-    #     parser.add_argument("--bucket", default="./shards")
-    #     parser.add_argument("--shards", default="imagenet-train-{000000..001281}.tar")
-    #     parser.add_argument("--valshards", default="imagenet-val-{000000..000006}.tar")
-    #     return parser
-# def nodesplitter_func(urls):  # SimpleShardList
-#     # print(urls)
-#     try:
-#         node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size()
-#         urls_to_use = list(urls)[node_id::node_count]
-#         logging.info(f'id: {node_id}, of count {node_count}. \nURLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\n\n')
-#         return urls_to_use
-#     except RuntimeError:
-#         # print('Distributed not initialised. Hopefully single node.')
-#         return urls
+
+    def predict_dataloader(self):
+        return self.make_loader(self.predict_urls, mode="predict")

 def identity(x):
     return x

 def nodesplitter_func(urls):
-    # num_urls = len(list(urls.copy()))
     urls_to_use = list(wds.split_by_node(urls))  # rely on WDS for the hard work
     rank, world_size, worker, num_workers = wds.utils.pytorch_worker_info()
     logging.info(
@@ -205,14 +177,16 @@ def nodesplitter_func(urls):
     )
     return urls_to_use

+def interpret_shard_size_from_url(url):
+    return int(url.rstrip('.tar').split('_')[-1])
+
+def interpret_dataset_size_from_urls(urls):
+    return sum([interpret_shard_size_from_url(url) for url in urls])
+
+def get_first(x):
+    return x[0]

-# def split_by_worker(urls):
-#     rank, world_size, worker, num_workers = wds.utils.pytorch_worker_info()
-#     if num_workers > 1:
-#         logging.info(f'Slicing urls for rank {rank}, world_size {world_size}, worker {worker}')
-#         for s in islice(urls, worker, None, num_workers):
-#             yield s
-#     else:
-#         logging.warning('only one worker?!')
-#         for s in urls:
-#             yield s
+def custom_collate(x):
+    if isinstance(x, list) and len(x) == 1:
+        x = x[0]
+    return torch.utils.data.default_collate(x)
\ No newline at end of file
diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py
index 0abff0a2..325a5864 100644
--- a/zoobot/pytorch/datasets/webdataset_utils.py
+++ b/zoobot/pytorch/datasets/webdataset_utils.py
@@ -40,7 +40,7 @@ def make_mock_wds(save_dir: str, label_cols: List, n_shards: int, shard_size: in
-def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse_label_df=None):
+def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse_label_df=None, overwrite=False):
     assert '.tar' in save_loc
     df['id_str'] = df['id_str'].astype(str).str.replace('.', '_')
@@ -85,11 +85,12 @@ def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse
         if sparse_label_df is not None:
             shard_df = pd.merge(shard_df, sparse_label_df, how='left', validate='one_to_one', suffixes=('', '_badlabelmerge'))  # auto-merge
         shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar')
-        logging.info(shard_save_loc)
-        sink = wds.TarWriter(shard_save_loc)
-        for _, galaxy in shard_df.iterrows():
-            sink.write(galaxy_to_wds(galaxy, label_cols, transform=transform))
-        sink.close()
+        if overwrite or not(os.path.isfile(shard_save_loc)):
+            logging.info(shard_save_loc)
+            sink = wds.TarWriter(shard_save_loc)
+            for _, galaxy in shard_df.iterrows():
+                sink.write(galaxy_to_wds(galaxy, label_cols, transform=transform))
+            sink.close()

 def galaxy_to_wds(galaxy: pd.Series, label_cols, transform=None):
diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py
index 9f26ba89..3f6e0d38 100755
--- a/zoobot/pytorch/estimators/define_model.py
+++ b/zoobot/pytorch/estimators/define_model.py
@@ -105,6 +105,11 @@ def on_test_batch_end(self, outputs, *args):
     def predict_step(self, batch, batch_idx, dataloader_idx=0):
+        # I can't work out how to get webdataset to return a single item im, not a tuple (im,).
+        # this is fine for training but annoying for predict
+        # help welcome. meanwhile, this works around it
+        if isinstance(batch, list) and len(batch) == 1:
+            return self(batch[0])
         # https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#inference
         # this calls forward, while avoiding the need for e.g.
model.eval(), torch.no_grad() # x, y = batch # would be usual format, but here, batch does not include labels diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 2d3a68d4..09272084 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -2,6 +2,7 @@ # https://github.com/inigoval/finetune/blob/main/finetune.py import logging import os +from typing import Any import warnings from functools import partial @@ -182,10 +183,6 @@ def configure_optimizers(self): "lr": lr * (self.lr_decay**i) }) - # TODO this actually breaks training because the generator only iterates once! - # total_params = sum(p.numel() for param_set in params.copy() for p in param_set['params']) - # logging.info('Total params to fit: {}'.format(total_params)) - # Initialize AdamW optimizer opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict @@ -219,6 +216,14 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): def test_step(self, batch, batch_idx, dataloader_idx=0): return self.make_step(batch) + + def predict_step(self, batch, batch_idx) -> Any: + # I can't work out how to get webdataset to return a single item im, not a tuple (im,). + # this is fine for training but annoying for predict + # help welcome. meanwhile, this works around it + if isinstance(batch, list) and len(batch) == 1: + return self(batch[0]) + return self(batch) def on_train_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx=0): # v2 docs currently do not show dataloader_idx as train argument so unclear if this will value be updated properly @@ -355,6 +360,9 @@ def on_test_batch_end(self, step_output, *args) -> None: def predict_step(self, x, batch_idx): + # see Abstract version + if isinstance(x, list) and len(x) == 1: + return self(x[0]) x = self.forward(x) # logits from LinearClassifier # then applies softmax return F.softmax(x, dim=1) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 42e63bd0..50c5759a 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -235,6 +235,7 @@ def train_default_zoobot_from_scratch( datamodule = webdatamodule.WebDataModule( train_urls=train_urls, val_urls=val_urls, + test_urls=test_urls, label_cols=schema.label_cols, # hardware batch_size=batch_size, @@ -245,8 +246,7 @@ def train_default_zoobot_from_scratch( color=color, crop_scale_bounds=crop_scale_bounds, crop_ratio_bounds=crop_ratio_bounds, - resize_after_crop=resize_after_crop, - # TODO pass through the rest + resize_after_crop=resize_after_crop ) datamodule.setup(stage='fit') From afae520f832f8cb678277f79a9639bb487dddf19 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 17:09:59 -0500 Subject: [PATCH 157/307] try maxvit on desi only --- only_for_me/narval/train.sh | 18 ++++++++++++------ zoobot/pytorch/datasets/webdataset_utils.py | 8 +++++--- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index ec66aab8..f77b262c 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -20,14 +20,20 @@ export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use t # echo "r$SLURM_NODEID Launching python script" REPO_DIR=/project/def-bovy/walml/zoobot +# srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ +# --save-dir 
$REPO_DIR/only_for_me/narval/desi_300px_f128_1gpu \ +# --batch-size 256 \ +# --num-features 128 \ +# --gpus 1 \ +# --num-workers 10 \ +# --color --wandb --mixed-precision --compile-encoder + srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_f128_1gpu \ - --batch-size 256 \ - --num-features 128 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvittiny_rw_224_1gpu \ + --batch-size 64 \ --gpus 1 \ --num-workers 10 \ + --architecture maxvit_tiny_rw_224 \ --color --wandb --mixed-precision --compile-encoder -# srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py - - # --architecture maxvit_small_tf_224 \ + # maxvit_small_tf_224 \ diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index 325a5864..d4efe3b5 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -82,11 +82,13 @@ def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse # transform = None for shard_n, shard_df in tqdm.tqdm(enumerate(shard_dfs), total=len(shard_dfs)): - if sparse_label_df is not None: - shard_df = pd.merge(shard_df, sparse_label_df, how='left', validate='one_to_one', suffixes=('', '_badlabelmerge')) # auto-merge shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar') if overwrite or not(os.path.isfile(shard_save_loc)): - logging.info(shard_save_loc) + + if sparse_label_df is not None: + shard_df = pd.merge(shard_df, sparse_label_df, how='left', validate='one_to_one', suffixes=('', '_badlabelmerge')) # auto-merge + + # logging.info(shard_save_loc) sink = wds.TarWriter(shard_save_loc) for _, galaxy in shard_df.iterrows(): sink.write(galaxy_to_wds(galaxy, label_cols, transform=transform)) From a393a922f55cd8a0d8edfe012392880ad36b3bc3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 17:15:01 -0500 Subject: [PATCH 158/307] tweaks --- only_for_me/narval/train.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 486763a5..0b07e9c7 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -97,6 +97,12 @@ else: wandb_logger = None + timm_kwargs = {} + if 'efficientnet' in args.architecture_name: + timm_kwargs.update({'drop_path_rate': 0.2}) + if args.num_features != 1280: + timm_kwargs.update({'num_features': args.num_features}) + train_with_pytorch_lightning.train_default_zoobot_from_scratch( save_dir=args.save_dir, schema=schema, @@ -104,7 +110,7 @@ val_urls = val_urls, test_urls = None, architecture_name=args.architecture_name, - timm_kwargs={'drop_path_rate': 0.2, 'num_features': args.num_features}, + timm_kwargs=timm_kwargs, batch_size=args.batch_size, epochs=epochs, # rely on early stopping patience=10, From e60118beb11d90681295d1be0e642142d2b7979b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 17:29:04 -0500 Subject: [PATCH 159/307] maxvit runs, try 4 gpu in case small batches break --- only_for_me/narval/train.sh | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index f77b262c..065f7f7e 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 
+#SBATCH --gres=gpu:v100:4 nvidia-smi @@ -14,10 +14,7 @@ PYTHON=/home/walml/envs/zoobot39_dev/bin/python mkdir $SLURM_TMPDIR/cache # mkdir /tmp/cache -export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. -# export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. -# echo "r$SLURM_NODEID master: $MASTER_ADDR" -# echo "r$SLURM_NODEID Launching python script" +export NCCL_BLOCKING_WAIT=1 REPO_DIR=/project/def-bovy/walml/zoobot # srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ @@ -29,9 +26,9 @@ REPO_DIR=/project/def-bovy/walml/zoobot # --color --wandb --mixed-precision --compile-encoder srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvittiny_rw_224_1gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvittiny_rw_224_4gpu \ --batch-size 64 \ - --gpus 1 \ + --gpus 4 \ --num-workers 10 \ --architecture maxvit_tiny_rw_224 \ --color --wandb --mixed-precision --compile-encoder From 4d680d2d58cc29719c21588d99c0c2d96036a521 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 17:34:22 -0500 Subject: [PATCH 160/307] fix race condition --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 50c5759a..daa5ae9b 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -114,7 +114,10 @@ def train_default_zoobot_from_scratch( assert save_dir is not None if not os.path.isdir(save_dir): - os.mkdir(save_dir) + try: + os.mkdir(save_dir) + except FileExistsError(): + pass # another gpu process may have just made it logging.info(f'Saving to {save_dir}') if color: From be5dda8a5e523e13a03d7d240a18615d49f2eb83 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 17:55:56 -0500 Subject: [PATCH 161/307] effnet v2 on 1 gpu --- only_for_me/narval/train.sh | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 065f7f7e..890035d8 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 +#SBATCH --gres=gpu:v100:1 nvidia-smi @@ -25,12 +25,22 @@ REPO_DIR=/project/def-bovy/walml/zoobot # --num-workers 10 \ # --color --wandb --mixed-precision --compile-encoder + +# batch sizes +# v100 +# effnet b0 256 +# maxvittiny_rw_224 64 +# tf_efficientnetv2_b0 256? +# tf_efficientnetv2_s 64? +# pit_xs_224 256? +# pit_s_224 64? 
+ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvittiny_rw_224_4gpu \ - --batch-size 64 \ - --gpus 4 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_tf_efficientnetv2_b0_4gpu \ + --batch-size 256 \ + --gpus 1 \ --num-workers 10 \ - --architecture maxvit_tiny_rw_224 \ + --architecture tf_efficientnetv2_b0 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 012547a2e05d1a1dd2b62dbe91789cd16825cfa2 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 17:56:10 -0500 Subject: [PATCH 162/307] typo --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 890035d8..c0d4c0a3 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -36,7 +36,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot # pit_s_224 64? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_tf_efficientnetv2_b0_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_tf_efficientnetv2_b0_1gpu \ --batch-size 256 \ --gpus 1 \ --num-workers 10 \ From 94d1d5dc876313fd3f507dc0e301a31c1f21275b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:01:30 -0500 Subject: [PATCH 163/307] try pit xs --- only_for_me/narval/train.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c0d4c0a3..411c862d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -30,17 +30,17 @@ REPO_DIR=/project/def-bovy/walml/zoobot # v100 # effnet b0 256 # maxvittiny_rw_224 64 -# tf_efficientnetv2_b0 256? +# tf_efficientnetv2_b0 256 - runs # tf_efficientnetv2_s 64? # pit_xs_224 256? # pit_s_224 64? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_tf_efficientnetv2_b0_1gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_xs_224_1gpu \ --batch-size 256 \ --gpus 1 \ --num-workers 10 \ - --architecture tf_efficientnetv2_b0 \ + --architecture pit_xs_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From e1aebafb855ecb0da7be47e82d6c4a06674ad38b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:03:00 -0500 Subject: [PATCH 164/307] pit s 64 --- only_for_me/narval/train.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 411c862d..8a114b79 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -36,11 +36,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # pit_s_224 64? 
srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_xs_224_1gpu \ - --batch-size 256 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_s_224_1gpu \ + --batch-size 64 \ --gpus 1 \ --num-workers 10 \ - --architecture pit_xs_224 \ + --architecture pit_s_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 8f188b915106e45f06ebde3f4ec53b8a4f0aa0cc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:03:49 -0500 Subject: [PATCH 165/307] notes on closing --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 8a114b79..fca84c3e 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -32,8 +32,8 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvittiny_rw_224 64 # tf_efficientnetv2_b0 256 - runs # tf_efficientnetv2_s 64? -# pit_xs_224 256? -# pit_s_224 64? +# pit_xs_224 256? 42681961 +# pit_s_224 64? 42681996 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_s_224_1gpu \ From b2dc8165555410ed8fdc82fb38696c6fdfe3ae64 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:22:03 -0500 Subject: [PATCH 166/307] pit s at b256 with 4 gpu --- only_for_me/narval/train.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index fca84c3e..bd63793c 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -4,7 +4,7 @@ #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:4 nvidia-smi @@ -30,15 +30,15 @@ REPO_DIR=/project/def-bovy/walml/zoobot # v100 # effnet b0 256 # maxvittiny_rw_224 64 -# tf_efficientnetv2_b0 256 - runs +# tf_efficientnetv2_b0 256 - 50.55%, might squeeze x2 # tf_efficientnetv2_s 64? -# pit_xs_224 256? 42681961 -# pit_s_224 64? 42681996 +# pit_xs_224 256 - 40%, could do 512 +# pit_s_224 64? 
- 20%, could do x4 -> 256 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_s_224_1gpu \ - --batch-size 64 \ - --gpus 1 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_s_224_b256_4gpu \ + --batch-size 256 \ + --gpus 4 \ --num-workers 10 \ --architecture pit_s_224 \ --color --wandb --mixed-precision --compile-encoder From 8f5f5f781c8b24f05ac06064cb2e58ae49979b95 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:22:56 -0500 Subject: [PATCH 167/307] typo --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index bd63793c..42b71015 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G #SBATCH --gres=gpu:v100:4 From 8668f4c159817dabe85aebd52df1a5d3b11576df Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:23:43 -0500 Subject: [PATCH 168/307] xs 4 gpu why not --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 42b71015..1e07a85d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -36,11 +36,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # pit_s_224 64? - 20%, could do x4 -> 256 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_s_224_b256_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_xs_224_b256_4gpu \ --batch-size 256 \ --gpus 4 \ --num-workers 10 \ - --architecture pit_s_224 \ + --architecture pit_xs_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From e3819921102eb141f6aac03c45101bad413cfb42 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:25:26 -0500 Subject: [PATCH 169/307] 4gpu b0 why not --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 1e07a85d..ccae4c27 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -36,11 +36,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # pit_s_224 64? - 20%, could do x4 -> 256 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_xs_224_b256_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b0_4gpu \ --batch-size 256 \ --gpus 4 \ --num-workers 10 \ - --architecture pit_xs_224 \ + --architecture efficientnet_b0 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 8a8bb01e59d157c792d56754a20e36cf6b9fe159 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:27:48 -0500 Subject: [PATCH 170/307] maxvit_rmlp_small_rw_224 --- only_for_me/narval/train.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index ccae4c27..4cb9006d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -34,13 +34,14 @@ REPO_DIR=/project/def-bovy/walml/zoobot # tf_efficientnetv2_s 64? # pit_xs_224 256 - 40%, could do 512 # pit_s_224 64? 
- 20%, could do x4 -> 256 +# maxvit_small_224 32 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b0_4gpu \ - --batch-size 256 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_small_rw_224_4gpu \ + --batch-size 32 \ --gpus 4 \ --num-workers 10 \ - --architecture efficientnet_b0 \ + --architecture maxvit_rmlp_small_rw_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 6776e72b1f6385425f6cce22b82562fd8f9d4df3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:30:54 -0500 Subject: [PATCH 171/307] vit small --- only_for_me/narval/train.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 4cb9006d..26a0630d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -35,13 +35,14 @@ REPO_DIR=/project/def-bovy/walml/zoobot # pit_xs_224 256 - 40%, could do 512 # pit_s_224 64? - 20%, could do x4 -> 256 # maxvit_small_224 32 +# vit_small_patch16_224 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_small_rw_224_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_vit_small_patch16_224_4gpu \ --batch-size 32 \ --gpus 4 \ --num-workers 10 \ - --architecture maxvit_rmlp_small_rw_224 \ + --architecture vit_small_patch16_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 08df9995e8bf8f79ca6f0137e24dd5a3b8ac302a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 18:32:16 -0500 Subject: [PATCH 172/307] vit tiny --- only_for_me/narval/train.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 26a0630d..717ecab1 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -35,14 +35,15 @@ REPO_DIR=/project/def-bovy/walml/zoobot # pit_xs_224 256 - 40%, could do 512 # pit_s_224 64? - 20%, could do x4 -> 256 # maxvit_small_224 32 -# vit_small_patch16_224 +# vit_small_patch16_224 32? +# vit_tiny_patch16_224 64? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_vit_small_patch16_224_4gpu \ - --batch-size 32 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_vit_tiny_patch16_224_4gpu \ + --batch-size 64 \ --gpus 4 \ --num-workers 10 \ - --architecture vit_small_patch16_224 \ + --architecture vit_tiny_patch16_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From ed6ef43333910adcf3bb28ecfccaa99ffe16b00d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:16:05 -0500 Subject: [PATCH 173/307] retry v2b0 4gpu --- only_for_me/narval/train.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 717ecab1..9b749c08 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -39,11 +39,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # vit_tiny_patch16_224 64? 
srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_vit_tiny_patch16_224_4gpu \ - --batch-size 64 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_tf_efficientnetv2_b0_4gpu \ + --batch-size 256 \ --gpus 4 \ --num-workers 10 \ - --architecture vit_tiny_patch16_224 \ + --architecture tf_efficientnetv2_b0 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From e22439daca65ecfe4b8cb66d6ec2a6b940e03fb3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:20:29 -0500 Subject: [PATCH 174/307] typo --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index daa5ae9b..3a12e423 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -116,7 +116,7 @@ def train_default_zoobot_from_scratch( if not os.path.isdir(save_dir): try: os.mkdir(save_dir) - except FileExistsError(): + except FileExistsError: pass # another gpu process may have just made it logging.info(f'Saving to {save_dir}') From 0018d88cd87720caaecceeae1bb81815aa6173e7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:23:35 -0500 Subject: [PATCH 175/307] normal effnet 4gpu redo --- only_for_me/narval/train.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 9b749c08..cc02f7fc 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -28,7 +28,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot # batch sizes # v100 -# effnet b0 256 +# efficientnet_b0 256 # maxvittiny_rw_224 64 # tf_efficientnetv2_b0 256 - 50.55%, might squeeze x2 # tf_efficientnetv2_s 64? @@ -39,11 +39,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # vit_tiny_patch16_224 64? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_tf_efficientnetv2_b0_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b0_4gpu \ --batch-size 256 \ --gpus 4 \ --num-workers 10 \ - --architecture tf_efficientnetv2_b0 \ + --architecture efficientnet_b0 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From cb157b5c17d8668d2947691a11d2c615d53bd903 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:25:08 -0500 Subject: [PATCH 176/307] pit xs 4gpu redo --- only_for_me/narval/train.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index cc02f7fc..dcb415c7 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -39,11 +39,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # vit_tiny_patch16_224 64? 
srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b0_4gpu \ - --batch-size 256 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_xs_224_4gpu \ + --batch-size 512 \ --gpus 4 \ --num-workers 10 \ - --architecture efficientnet_b0 \ + --architecture pit_xs_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 68eb38e9e77fb7cff00e6e7159d59de0d9994b42 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:27:11 -0500 Subject: [PATCH 177/307] maxvit_rmlp_small_rw --- only_for_me/narval/train.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index dcb415c7..00b152d2 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -37,13 +37,14 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvit_small_224 32 # vit_small_patch16_224 32? # vit_tiny_patch16_224 64? +# maxvit_rmlp_small_rw 32? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_xs_224_4gpu \ - --batch-size 512 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_small_rw_4gpu \ + --batch-size 32 \ --gpus 4 \ --num-workers 10 \ - --architecture pit_xs_224 \ + --architecture maxvit_rmlp_small_rw \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 3be5f2a0e5f36c96d0c8467dae0f8c39009fd1bd Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:29:52 -0500 Subject: [PATCH 178/307] pit s --- only_for_me/narval/train.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 00b152d2..b665adb1 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -32,19 +32,19 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvittiny_rw_224 64 # tf_efficientnetv2_b0 256 - 50.55%, might squeeze x2 # tf_efficientnetv2_s 64? -# pit_xs_224 256 - 40%, could do 512 -# pit_s_224 64? - 20%, could do x4 -> 256 +# pit_xs_224 512 +# pit_s_224 256 # maxvit_small_224 32 # vit_small_patch16_224 32? # vit_tiny_patch16_224 64? # maxvit_rmlp_small_rw 32? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_small_rw_4gpu \ - --batch-size 32 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_s_224_4gpu \ + --batch-size 256 \ --gpus 4 \ --num-workers 10 \ - --architecture maxvit_rmlp_small_rw \ + --architecture pit_s_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From d6331883f70a77d3065bcf4f5bdabb4f7a7a91ac Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:41:46 -0500 Subject: [PATCH 179/307] convnext tiny 1gpu --- only_for_me/narval/train.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index b665adb1..a0198c25 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 +#SBATCH --gres=gpu:v100:1 nvidia-smi @@ -38,13 +38,16 @@ REPO_DIR=/project/def-bovy/walml/zoobot # vit_small_patch16_224 32? # vit_tiny_patch16_224 64? # maxvit_rmlp_small_rw 32? 
+# https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE +# convnext_nano +# convnext_tiny srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_pit_s_224_4gpu \ - --batch-size 256 \ - --gpus 4 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_convnext_tiny_1gpu \ + --batch-size 128 \ + --gpus 1 \ --num-workers 10 \ - --architecture pit_s_224 \ + --architecture convnext_tiny \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 34a19f37870578a508162f5edd3a60c1312cb41e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:43:37 -0500 Subject: [PATCH 180/307] effnet b2 --- only_for_me/narval/train.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index a0198c25..74ea359b 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:4 nvidia-smi @@ -41,13 +41,14 @@ REPO_DIR=/project/def-bovy/walml/zoobot # https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE # convnext_nano # convnext_tiny +# efficientnet_b2 64 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_convnext_tiny_1gpu \ - --batch-size 128 \ - --gpus 1 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b2_4gpu \ + --batch-size 64 \ + --gpus 4 \ --num-workers 10 \ - --architecture convnext_tiny \ + --architecture efficientnet_b2 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 6eac15e55a8b2af6de89869cd71f6765d94cb173 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:44:29 -0500 Subject: [PATCH 181/307] convnext small --- only_for_me/narval/train.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 74ea359b..87927ee2 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -41,14 +41,15 @@ REPO_DIR=/project/def-bovy/walml/zoobot # https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE # convnext_nano # convnext_tiny -# efficientnet_b2 64 +efficientnet_b2 +# convnext_small 64 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b2_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_convnext_small_4gpu \ --batch-size 64 \ --gpus 4 \ --num-workers 10 \ - --architecture efficientnet_b2 \ + --architecture convnext_small \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From fdeec727ec68e6279108778acecfcaf735a8dc9f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 21:45:13 -0500 Subject: [PATCH 182/307] typo --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 87927ee2..7a1b618f 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -41,7 +41,7 @@ REPO_DIR=/project/def-bovy/walml/zoobot # https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE # convnext_nano # convnext_tiny -efficientnet_b2 +# efficientnet_b2 # convnext_small 64 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ From 
e763eb11ac79a30de0a889d0f4e0da23d6e2148f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 22:25:54 -0500 Subject: [PATCH 183/307] typo --- only_for_me/narval/train.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 7a1b618f..b04d5f9e 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -31,25 +31,25 @@ REPO_DIR=/project/def-bovy/walml/zoobot # efficientnet_b0 256 # maxvittiny_rw_224 64 # tf_efficientnetv2_b0 256 - 50.55%, might squeeze x2 -# tf_efficientnetv2_s 64? +# tf_efficientnetv2_s 64? TODO # pit_xs_224 512 # pit_s_224 256 # maxvit_small_224 32 -# vit_small_patch16_224 32? +# vit_small_patch16_224 32? 17%, too small. TODO 128 (but pure vit is probably not great) # vit_tiny_patch16_224 64? -# maxvit_rmlp_small_rw 32? +# maxvit_rmlp_small_rw_224 32? # https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE # convnext_nano # convnext_tiny # efficientnet_b2 -# convnext_small 64 +# convnext_small 64? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_convnext_small_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_small_rw_224_4gpu \ --batch-size 64 \ --gpus 4 \ --num-workers 10 \ - --architecture convnext_small \ + --architecture maxvit_rmlp_small_rw_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 07d88ce3512f0cea7b4ad6e8638ed15fc5f23ff2 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 22:48:03 -0500 Subject: [PATCH 184/307] try effnetb4 --- only_for_me/narval/train.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index b04d5f9e..68d5c0fc 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -40,16 +40,17 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvit_rmlp_small_rw_224 32? # https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE # convnext_nano -# convnext_tiny -# efficientnet_b2 -# convnext_small 64? +# convnext_tiny - 128 +# efficientnet_b2 - 32% at 64, can do 128 +# convnext_small 64 - 49.25%, MAYBE 128 +# efficientnet_b4 srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_small_rw_224_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b4_4gpu \ --batch-size 64 \ --gpus 4 \ --num-workers 10 \ - --architecture maxvit_rmlp_small_rw_224 \ + --architecture efficientnet_b4 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From b979139711501f369826060fb961228e24fc8b92 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 23:07:19 -0500 Subject: [PATCH 185/307] effnet b5 --- only_for_me/narval/train.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 68d5c0fc..6e7c4c74 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -37,20 +37,21 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvit_small_224 32 # vit_small_patch16_224 32? 17%, too small. TODO 128 (but pure vit is probably not great) # vit_tiny_patch16_224 64? -# maxvit_rmlp_small_rw_224 32? 
+# maxvit_rmlp_small_rw_224 64 (97% allocated and very good efficiency) # https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE # convnext_nano # convnext_tiny - 128 # efficientnet_b2 - 32% at 64, can do 128 # convnext_small 64 - 49.25%, MAYBE 128 -# efficientnet_b4 +# efficientnet_b4 - 48% at 64, could maybe do 128 +# efficientnet_b5 - 64? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b4_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b5_4gpu \ --batch-size 64 \ --gpus 4 \ --num-workers 10 \ - --architecture efficientnet_b4 \ + --architecture efficientnet_b5 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From 58540157f50ccce29fa17472cfd4c395d371b76a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 23:08:15 -0500 Subject: [PATCH 186/307] maxvit_rmlp_base_rw_224 --- only_for_me/narval/train.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 6e7c4c74..ce0ed0a7 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -45,13 +45,14 @@ REPO_DIR=/project/def-bovy/walml/zoobot # convnext_small 64 - 49.25%, MAYBE 128 # efficientnet_b4 - 48% at 64, could maybe do 128 # efficientnet_b5 - 64? +# maxvit_rmlp_base_rw_224 - 32? srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b5_4gpu \ - --batch-size 64 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu \ + --batch-size 32 \ --gpus 4 \ --num-workers 10 \ - --architecture efficientnet_b5 \ + --architecture maxvit_rmlp_base_rw_224 \ --color --wandb --mixed-precision --compile-encoder # maxvit_small_tf_224 \ From fc89502b36ed616bf74e9cf5ed56599e052b60dc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 23:30:31 -0500 Subject: [PATCH 187/307] maxvit_rmlp_base_rw_224 on 16 gpus, just becase --- only_for_me/narval/train.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index ce0ed0a7..4f34818c 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --time=23:30:0 -#SBATCH --nodes=1 +#SBATCH --nodes=4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G @@ -44,13 +44,14 @@ REPO_DIR=/project/def-bovy/walml/zoobot # efficientnet_b2 - 32% at 64, can do 128 # convnext_small 64 - 49.25%, MAYBE 128 # efficientnet_b4 - 48% at 64, could maybe do 128 -# efficientnet_b5 - 64? -# maxvit_rmlp_base_rw_224 - 32? +# efficientnet_b5 - 64. remember it expects bigger images tho, may not work great +# maxvit_rmlp_base_rw_224 - 32 (95%). 
Now scaling at 16 gpus srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu \ --batch-size 32 \ --gpus 4 \ + --nodes 4 \ --num-workers 10 \ --architecture maxvit_rmlp_base_rw_224 \ --color --wandb --mixed-precision --compile-encoder From 5a77f5d6f8ede7a928a26250102e812924f9ce41 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 23:42:45 -0500 Subject: [PATCH 188/307] cache dir on multinode --- only_for_me/narval/train.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 0b07e9c7..a726f7e1 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -70,6 +70,11 @@ else: search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_300px_2048/desi_labelled_300px_2048_train_*.tar' cache_dir = os.environ['SLURM_TMPDIR'] + '/cache' + if not os.path.isdir(cache_dir): + try: + os.mkdir(cache_dir) + except FileExistsError: + pass # race condition all_urls = glob.glob(search_str) assert len(all_urls) > 0, search_str From 93285be29935b2e93dc88976183c750ead9402ca Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 23 Nov 2023 23:56:31 -0500 Subject: [PATCH 189/307] 5 worker version --- only_for_me/narval/train.py | 3 +++ only_for_me/narval/train.sh | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index a726f7e1..0bca165e 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -81,6 +81,9 @@ # train_urls, val_urls = all_urls[:70], all_urls[70:] # train_urls, val_urls = all_urls[:60], all_urls[60:70] train_urls, val_urls = all_urls[:120], all_urls[120:140] # all num shards must be divisible by workers * gpus e.g. 10*1, 10*2 + # train_urls, val_urls = all_urls[:112], all_urls[112:140] # divisible by 16 + train_urls = train_urls * 4 + val_urls = val_urls * 4 schema = schemas.decals_all_campaigns_ortho_schema # debug mode diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 4f34818c..554c44d5 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -48,11 +48,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvit_rmlp_base_rw_224 - 32 (95%). 
Now scaling at 16 gpus srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_5w \ --batch-size 32 \ --gpus 4 \ --nodes 4 \ - --num-workers 10 \ + --num-workers 5 \ --architecture maxvit_rmlp_base_rw_224 \ --color --wandb --mixed-precision --compile-encoder From 076be8ee1d4e4a45cf99ef68f8c1217a7154db61 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 24 Nov 2023 10:09:52 -0500 Subject: [PATCH 190/307] ran but hit nans, add gradient clip and w=0.05 --- only_for_me/narval/train.py | 12 +++++++++++- only_for_me/narval/train.sh | 13 +++++++------ .../training/train_with_pytorch_lightning.py | 3 ++- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 0bca165e..302a57a2 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -42,6 +42,10 @@ default=False, action='store_true') parser.add_argument('--wandb', dest='wandb', default=False, action='store_true') + parser.add_argument('--weight-decay', dest='weight_decay', + default=0.01, type=float) + parser.add_argument('--learning-rate', dest='learning_rate', + default=1e-3, type=float) parser.add_argument('--seed', dest='random_state', default=1, type=int) args = parser.parse_args() @@ -84,6 +88,11 @@ # train_urls, val_urls = all_urls[:112], all_urls[112:140] # divisible by 16 train_urls = train_urls * 4 val_urls = val_urls * 4 + import random + random.shuffle(train_urls) + random.shuffle(val_urls) + # 120 * 4 = 480. 480 / 5 / 16 = 8 :) + # 20 * 4 = 80. 80 / 5 / 16 = 1 :) schema = schemas.decals_all_campaigns_ortho_schema # debug mode @@ -135,7 +144,8 @@ num_workers=args.num_workers, compile_encoder=args.compile_encoder, # NEW random_state=random_state, - learning_rate=1e-3, + learning_rate=args.learning_rate, + weight_decay=args.weight_decay, cache_dir=cache_dir, crop_scale_bounds=(0.75, 0.85) # slightly increased to compensate for 424-400px crop when saving webdataset # cache_dir='/tmp/cache' diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 554c44d5..d1743bf0 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=4 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 +#SBATCH --gres=gpu:v100:1 nvidia-smi @@ -48,11 +48,12 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvit_rmlp_base_rw_224 - 32 (95%). 
Now scaling at 16 gpus srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_5w \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_debug \ --batch-size 32 \ - --gpus 4 \ - --nodes 4 \ + --gpus 1 \ + --nodes 1 \ --num-workers 5 \ + --weight-decay 0.05 \ --architecture maxvit_rmlp_base_rw_224 \ --color --wandb --mixed-precision --compile-encoder diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 3a12e423..a51a7be2 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -303,7 +303,8 @@ def train_default_zoobot_from_scratch( callbacks=callbacks, max_epochs=epochs, default_root_dir=save_dir, - plugins=plugins + plugins=plugins, + gradient_clip_val=1. # new, for large models # , # limit_train_batches=1, # limit_val_batches=1 From 1ec59bd6b90f196cba0d77caa8a0b871d9077a35 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 24 Nov 2023 11:51:34 -0500 Subject: [PATCH 191/307] runs, restart w/ 4 gpu --- only_for_me/narval/train.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index d1743bf0..e095dac5 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --time=23:30:0 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G #SBATCH --gres=gpu:v100:1 @@ -50,8 +50,8 @@ REPO_DIR=/project/def-bovy/walml/zoobot srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_debug \ --batch-size 32 \ - --gpus 1 \ - --nodes 1 \ + --gpus 4 \ + --nodes 4 \ --num-workers 5 \ --weight-decay 0.05 \ --architecture maxvit_rmlp_base_rw_224 \ From 1cdc977db756768d8999f4f059fc9c10d2aa2cd1 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 24 Nov 2023 14:50:30 -0500 Subject: [PATCH 192/307] wds support for ZoobotEncoder --- zoobot/pytorch/training/representations.py | 2 ++ zoobot/shared/load_predictions.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/representations.py b/zoobot/pytorch/training/representations.py index f3f577cc..dd8912a1 100644 --- a/zoobot/pytorch/training/representations.py +++ b/zoobot/pytorch/training/representations.py @@ -12,4 +12,6 @@ def __init__(self, encoder, pyramid=False) -> None: raise NotImplementedError('Will eventually support resetting timm classifier to get FPN features') def forward(self, x): + if isinstance(x, list) and len(x) == 1: + return self(x[0]) return self.encoder(x) diff --git a/zoobot/shared/load_predictions.py b/zoobot/shared/load_predictions.py index 9373b488..fbaa9548 100644 --- a/zoobot/shared/load_predictions.py +++ b/zoobot/shared/load_predictions.py @@ -72,7 +72,7 @@ def load_hdf5s(hdf5_locs: List): 'id_str': [p for metadata in prediction_metadata for p in metadata['id_str']], 'hdf5_loc': [l for metadata in prediction_metadata for l in metadata['hdf5_loc']] } - assert len(prediction_metadata['id_str']) == len(predictions) + assert len(prediction_metadata['id_str']) == len(predictions), (len(prediction_metadata['id_str']), len(predictions)) galaxy_id_df = pd.DataFrame(data=prediction_metadata) @@ -163,10 +163,12 @@ def 
prediction_hdf5_to_summary_parquet(hdf5_loc: str, save_loc: str, schema: sch upper_edge_cols = [col + '_90pc-upper' for col in label_cols] proportion_asked_cols = [col + '_proportion-asked' for col in label_cols] + # make friendly dataframe with just masked fraction and description string friendly_loc = save_loc.replace('.parquet', '_friendly.parquet') fraction_df = pd.DataFrame(data=masked_fractions, columns=fraction_cols) friendly_df = pd.concat([galaxy_id_df, fraction_df], axis=1) + friendly_df = convert_halfprecision_cols(friendly_df) friendly_df.to_parquet(friendly_loc, index=False) logging.info('Friendly summary table saved to {}'.format(friendly_loc)) @@ -177,10 +179,18 @@ def prediction_hdf5_to_summary_parquet(hdf5_loc: str, save_loc: str, schema: sch upper_edge_df = pd.DataFrame(data=all_upper_edges, columns=upper_edge_cols) proportion_df = pd.DataFrame(data=prob_of_asked_by_answer, columns=proportion_asked_cols) advanced_df = pd.concat([galaxy_id_df, fraction_df, lower_edge_df, upper_edge_df, proportion_df], axis=1) + advanced_df = convert_halfprecision_cols(advanced_df) advanced_df.to_parquet(advanced_loc, index=False) logging.info('Advanced summary table saved to {}'.format(advanced_loc)) +def convert_halfprecision_cols(df): + # convert any half-precision columns, parquet can't save these + half_floats = df.select_dtypes(include="float16") + df[half_floats.columns] = half_floats.astype("float32") + return df + + def single_forward_pass_hdf5s_to_df(hdf5_locs: List, drop_extra_dims=False): """ Load predictions (or representations) saved as hdf5 into pd.DataFrame with id_str and label_cols columns From 754d003fed33c9670997931283572ef05c1e54b2 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 24 Nov 2023 15:14:55 -0500 Subject: [PATCH 193/307] autofill label cols --- zoobot/shared/save_predictions.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/zoobot/shared/save_predictions.py b/zoobot/shared/save_predictions.py index 5505ecc9..5e43e592 100755 --- a/zoobot/shared/save_predictions.py +++ b/zoobot/shared/save_predictions.py @@ -10,6 +10,9 @@ def predictions_to_hdf5(predictions, id_str, label_cols, save_loc, compression="gzip"): logging.info(f'Saving predictions to {save_loc}') assert save_loc.endswith('.hdf5') + if label_cols is None: + label_cols = get_default_label_cols(predictions) + # sometimes throws a "could not lock file" error but still saves fine. I don't understand why with h5py.File(save_loc, "w") as f: f.create_dataset(name='predictions', data=predictions, compression=compression) # https://docs.h5py.org/en/stable/special.html#h5py.string_dtype @@ -17,12 +20,13 @@ def predictions_to_hdf5(predictions, id_str, label_cols, save_loc, compression=" # predictions_dset.attrs['label_cols'] = label_cols # would be more conventional but is a little awkward f.create_dataset(name='id_str', data=id_str, dtype=dt) f.create_dataset(name='label_cols', data=label_cols, dtype=dt) - # sometimes throws a "could not lock file" error but still saves fine. 
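Context for convert_halfprecision_cols above: with mixed-precision inference the prediction arrays can come back as float16, and the parquet writers reject half-precision columns (hence "parquet can't save these"). A tiny illustration with a made-up column name:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'feat_0': np.zeros(4, dtype=np.float16)})
    # df.to_parquet('preds.parquet')  # fails while the column is float16
    half_floats = df.select_dtypes(include='float16')
    df[half_floats.columns] = half_floats.astype('float32')
    df.to_parquet('preds.parquet')  # fine after upcasting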
I don't understand why - def predictions_to_csv(predictions, id_str, label_cols, save_loc): + # not recommended - hdf5 is much more flexible and pretty easy to use once you check the package quickstart assert save_loc.endswith('.csv') + if label_cols is None: + label_cols = get_default_label_cols(predictions) data = [prediction_to_row(predictions[n], id_str[n], label_cols=label_cols) for n in range(len(predictions))] predictions_df = pd.DataFrame(data) # logging.info(predictions_df) @@ -57,3 +61,8 @@ def prediction_to_row(prediction: np.ndarray, id_str: str, label_cols: List): else: row[answer + '_pred'] = json.dumps(list(answer_pred)) # it's not a scalar, write as json return row + +def get_default_label_cols(predictions): + logging.warning('No label_cols passed - using default names e.g. feat_0, feat_1...') + label_cols = [f'feat_{n}' for n in range(predictions.shape[1])] + return label_cols From 2835325b6f9a9074170bf405db7a3e612ee8789d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 24 Nov 2023 17:06:10 -0500 Subject: [PATCH 194/307] debug mismatched shuffle --- zoobot/pytorch/datasets/webdatamodule.py | 26 ++++++++++++++++-------- zoobot/shared/load_predictions.py | 2 ++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index cf40e629..09c949a5 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -97,6 +97,8 @@ def label_transform(label_dict): def make_loader(self, urls, mode="train"): + logging.info('Making loader with mode {}'.format(mode)) + dataset_size = getattr(self, f'{mode}_size') if mode == "train": shuffle = min(dataset_size, 5000) @@ -108,18 +110,24 @@ def make_loader(self, urls, mode="train"): transform_label = self.make_label_transform() - dataset = ( + dataset = wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) # https://webdataset.github.io/webdataset/multinode/ # WDS 'knows' which worker it is running on and selects a subset of urls accordingly - wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func - ) - .shuffle(shuffle) - .decode("rgb") - ) + + if shuffle > 0: + dataset = dataset.shuffle(shuffle) + + dataset = dataset.decode("rgb") + if mode == 'predict': - # dataset = dataset.extract_keys('image.jpg').map(transform_image) - dataset = dataset.to_tuple('image.jpg').map_tuple(transform_image) # (im,) tuple. But map applied to all elements - # .map(get_first) + if self.label_cols != ['id_str']: + logging.info('Will return images only') + # dataset = dataset.extract_keys('image.jpg').map(transform_image) + dataset = dataset.to_tuple('image.jpg').map_tuple(transform_image) # (im,) tuple. 
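The label_cols autofill above is aimed at saving representations, where there is no natural name per output dimension. Usage sketch (array shape and save path are made up):

    import numpy as np
    from zoobot.shared import save_predictions

    features = np.random.rand(100, 640)  # one row of encoder features per galaxy
    id_strs = [f'galaxy_{n}' for n in range(100)]
    # label_cols=None now falls back to get_default_label_cols: feat_0 ... feat_639
    save_predictions.predictions_to_hdf5(features, id_strs, label_cols=None, save_loc='representations.hdf5')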
But map applied to all elements + # .map(get_first) + else: + logging.info('Will return id_str only') + dataset = dataset.to_tuple('__key__') else: dataset = ( dataset.to_tuple('image.jpg', 'labels.json') diff --git a/zoobot/shared/load_predictions.py b/zoobot/shared/load_predictions.py index fbaa9548..db9876cf 100644 --- a/zoobot/shared/load_predictions.py +++ b/zoobot/shared/load_predictions.py @@ -49,6 +49,7 @@ def load_hdf5s(hdf5_locs: List): 'id_str': f['id_str'].asstr()[:], 'hdf5_loc': [os.path.basename(loc) for _ in these_predictions] } + assert len(these_predictions) == len(these_prediction_metadata['id_str']), (loc, len(these_predictions), len(these_prediction_metadata['id_str']) ) predictions.append(these_predictions) # will create a list where each element is 3D predictions stored in each hdf5 prediction_metadata.append(these_prediction_metadata) # also track id_str, similarly @@ -223,6 +224,7 @@ def single_forward_pass_hdf5s_to_df(hdf5_locs: List, drop_extra_dims=False): # copy over metadata (indices will align) prediction_df['id_str'] = galaxy_id_df['id_str'] prediction_df['hdf5_loc'] = galaxy_id_df['hdf5_loc'] + prediction_df = convert_halfprecision_cols(prediction_df) return prediction_df From d0b7ddd5d37fd75e220a81a66fca919f4325bc03 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 25 Nov 2023 09:50:01 -0500 Subject: [PATCH 195/307] rmlp base 4gpu --- only_for_me/narval/train.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index e095dac5..e7324790 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 -#SBATCH --nodes=4 +#SBATCH --nodes=1 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:4 nvidia-smi @@ -48,13 +48,15 @@ REPO_DIR=/project/def-bovy/walml/zoobot # maxvit_rmlp_base_rw_224 - 32 (95%). 
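On "debug mismatched shuffle": the id_strs and the predictions come from separate passes over the shards, so both pipelines must visit galaxies in the same order. A quick standalone check using plain webdataset (shard names are hypothetical, and shuffling must stay off):

    import webdataset as wds

    urls = ['shard_000.tar', 'shard_001.tar']
    ids_only = wds.WebDataset(urls, shardshuffle=False).to_tuple('__key__')
    with_images = wds.WebDataset(urls, shardshuffle=False).decode('rgb').to_tuple('__key__', 'image.jpg')

    for (key_a,), (key_b, _) in zip(ids_only, with_images):
        assert key_a == key_b, (key_a, key_b)  # a mismatch means the pipelines walk shards differently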
Now scaling at 16 gpus srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_debug \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_w005 \ --batch-size 32 \ --gpus 4 \ - --nodes 4 \ + --nodes 1 \ --num-workers 5 \ --weight-decay 0.05 \ --architecture maxvit_rmlp_base_rw_224 \ - --color --wandb --mixed-precision --compile-encoder + --color --wandb --mixed-precision + + # --compile-encoder # maxvit_small_tf_224 \ From fda028aa4812285d78a521a60ded6831fac6f2c8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 25 Nov 2023 13:18:35 -0500 Subject: [PATCH 196/307] subset frac --- zoobot/shared/load_predictions.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/zoobot/shared/load_predictions.py b/zoobot/shared/load_predictions.py index db9876cf..874ae9f9 100644 --- a/zoobot/shared/load_predictions.py +++ b/zoobot/shared/load_predictions.py @@ -192,7 +192,7 @@ def convert_halfprecision_cols(df): return df -def single_forward_pass_hdf5s_to_df(hdf5_locs: List, drop_extra_dims=False): +def single_forward_pass_hdf5s_to_df(hdf5_locs: List, drop_extra_dims=False, subset_frac=None): """ Load predictions (or representations) saved as hdf5 into pd.DataFrame with id_str and label_cols columns @@ -208,9 +208,11 @@ def single_forward_pass_hdf5s_to_df(hdf5_locs: List, drop_extra_dims=False): _type_: _description_ """ galaxy_id_df, predictions, label_cols = load_hdf5s(hdf5_locs) + logging.info('HDF5s loaded.') predictions = predictions.squeeze() - + + if len(predictions.shape) > 2: if drop_extra_dims: predictions = predictions[:, :, 0] @@ -221,6 +223,14 @@ def single_forward_pass_hdf5s_to_df(hdf5_locs: List, drop_extra_dims=False): I suggest using load_hdf5s directly to work with np.arrays, not with DataFrame - see docstring' ) prediction_df = pd.DataFrame(data=predictions, columns=label_cols) + + if subset_frac is not None: + logging.warning('Selecting a random subset: {}'.format(subset_frac)) + prediction_df = prediction_df.sample(frac=subset_frac, random_state=42) + + + del predictions + logging.info('Saving') # copy over metadata (indices will align) prediction_df['id_str'] = galaxy_id_df['id_str'] prediction_df['hdf5_loc'] = galaxy_id_df['hdf5_loc'] From 0c1d6e25276f4ae542429909d678c4bafac1fed5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 10:31:07 -0500 Subject: [PATCH 197/307] prepare for terrestrial --- only_for_me/narval/train.py | 5 +++++ only_for_me/narval/train.sh | 29 +++++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py index 302a57a2..663b2880 100644 --- a/only_for_me/narval/train.py +++ b/only_for_me/narval/train.py @@ -25,6 +25,8 @@ # parser.add_argument('--data-dir', dest='data_dir', type=str) # parser.add_argument('--dataset', dest='dataset', type=str, help='dataset to use, either "gz_decals_dr5" or "gz_evo"') parser.add_argument('--architecture', dest='architecture_name', default='efficientnet_b0', type=str) + parser.add_argument('--accumulate-gradients', dest='acculumate_gradients', default=1, type=int) + parser.add_argument('--terrestrial-init', dest='terrestrial', default=False, action='store_true') parser.add_argument('--resize-after-crop', dest='resize_after_crop', type=int, default=224) parser.add_argument('--color', default=False, action='store_true') @@ -120,6 +122,9 @@ if args.num_features != 1280: 
timm_kwargs.update({'num_features': args.num_features}) + if args.terrestrial: + timm_kwargs.update({'pretrained': True}) + train_with_pytorch_lightning.train_default_zoobot_from_scratch( save_dir=args.save_dir, schema=schema, diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index e7324790..791180c8 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -23,7 +23,16 @@ REPO_DIR=/project/def-bovy/walml/zoobot # --num-features 128 \ # --gpus 1 \ # --num-workers 10 \ -# --color --wandb --mixed-precision --compile-encoder +# --color --wandb --mixed-precision + +srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvittiny_1gpu \ + --batch-size 64 \ + --gpus 1 \ + --num-workers 10 \ + --color --wandb --mixed-precision + + # \ --compile-encoder # batch sizes @@ -47,15 +56,15 @@ REPO_DIR=/project/def-bovy/walml/zoobot # efficientnet_b5 - 64. remember it expects bigger images tho, may not work great # maxvit_rmlp_base_rw_224 - 32 (95%). Now scaling at 16 gpus -srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_w005 \ - --batch-size 32 \ - --gpus 4 \ - --nodes 1 \ - --num-workers 5 \ - --weight-decay 0.05 \ - --architecture maxvit_rmlp_base_rw_224 \ - --color --wandb --mixed-precision +# srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ +# --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_w005 \ +# --batch-size 32 \ +# --gpus 4 \ +# --nodes 1 \ +# --num-workers 5 \ +# --weight-decay 0.05 \ +# --architecture maxvit_rmlp_base_rw_224 \ +# --color --wandb --mixed-precision # --compile-encoder From 1e5fcc77e29288ff74be42cfeaa366ae211028fb Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 10:32:29 -0500 Subject: [PATCH 198/307] 1 gpu --- only_for_me/narval/train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 791180c8..dfba9f94 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=23:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=4 +#SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 +#SBATCH --gres=gpu:v100:1 nvidia-smi From 49db7f501b6d7b3e572dbf6d1f1a12b53a38cf58 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 10:44:45 -0500 Subject: [PATCH 199/307] typo --- only_for_me/narval/train.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index dfba9f94..1bb19d4b 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -26,8 +26,9 @@ REPO_DIR=/project/def-bovy/walml/zoobot # --color --wandb --mixed-precision srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvittiny_1gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_tiny_rw_224_1gpu \ --batch-size 64 \ + --architecture maxvit_tiny_rw_224 \ --gpus 1 \ --num-workers 10 \ --color --wandb --mixed-precision @@ -38,7 +39,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ # batch sizes # v100 # efficientnet_b0 256 -# maxvittiny_rw_224 64 +# maxvit_tiny_rw_224 64 # tf_efficientnetv2_b0 256 - 50.55%, might squeeze x2 # tf_efficientnetv2_s 64? 
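"Terrestrial init" here just means starting from timm's ImageNet-pretrained weights instead of random initialisation; with pretrained=True in timm_kwargs the encoder construction ends up as roughly:

    import timm

    # pretrained=True loads ImageNet weights; num_classes=0 strips the classification head
    encoder = timm.create_model('maxvit_tiny_rw_224', in_chans=3, num_classes=0, pretrained=True)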
TODO # pit_xs_224 512 From a7be25137642257cec2128f9ba9174a131e334f5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 10:52:22 -0500 Subject: [PATCH 200/307] request less time --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 1bb19d4b..7c02ef44 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --time=23:30:0 +#SBATCH --time=03:30:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=10 From ff72870ddda2bab337652dd4dedf84f122c25a70 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 11:03:04 -0500 Subject: [PATCH 201/307] try with terrestrial init, no norm --- only_for_me/narval/train.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 7c02ef44..262db057 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -29,6 +29,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_tiny_rw_224_1gpu \ --batch-size 64 \ --architecture maxvit_tiny_rw_224 \ + --terrestrial \ --gpus 1 \ --num-workers 10 \ --color --wandb --mixed-precision From 7873cf9c3ae3e67b745d0b6fcb13b6cf64e99e55 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 11:20:15 -0500 Subject: [PATCH 202/307] timm kwargs instead --- zoobot/pytorch/estimators/define_model.py | 4 ++-- zoobot/pytorch/training/train_with_pytorch_lightning.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 3f6e0d38..b327a7d7 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -147,7 +147,7 @@ def __init__( # encoder args architecture_name="efficientnet_b0", channels=1, - use_imagenet_weights=False, + # use_imagenet_weights=False, test_time_dropout=True, compile_encoder=False, timm_kwargs={}, # passed to timm.create_model e.g. 
drop_path_rate=0.2 for effnet @@ -189,7 +189,7 @@ def __init__( self.encoder = get_pytorch_encoder( architecture_name, channels, - use_imagenet_weights=use_imagenet_weights, + # use_imagenet_weights=use_imagenet_weights, **timm_kwargs ) if compile_encoder: diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index a51a7be2..d5cab9cb 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -260,7 +260,7 @@ def train_default_zoobot_from_scratch( question_index_groups=schema.question_index_groups, architecture_name=architecture_name, channels=channels, - use_imagenet_weights=False, + # use_imagenet_weights=False, test_time_dropout=True, dropout_rate=dropout_rate, learning_rate=learning_rate, From fe9ad7b6ceca506222668e1e50319855a7d43039 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 11:29:02 -0500 Subject: [PATCH 203/307] silly typo --- zoobot/pytorch/estimators/define_model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index b327a7d7..b4d784b2 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -131,7 +131,6 @@ class ZoobotTree(GenericLightningModule): question_index_groups (List): Mapping of which label indices are part of the same question. See :ref:`training_on_vote_counts`. architecture_name (str, optional): Architecture to use. Passed to timm. Must be in timm.list_models(). Defaults to "efficientnet_b0". channels (int, optional): Num. input channels. Probably 3 or 1. Defaults to 1. - use_imagenet_weights (bool, optional): Load weights pretrained on ImageNet (NOT galaxies!). Defaults to False. test_time_dropout (bool, optional): Apply dropout at test time, to pretend to be Bayesian. Defaults to True. timm_kwargs (dict, optional): passed to timm.create_model e.g. drop_path_rate=0.2 for effnet. Defaults to {}. learning_rate (float, optional): AdamW learning rate. Defaults to 1e-3. 
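With use_imagenet_weights retired, anything timm-specific now travels through timm_kwargs. A sketch of the calling convention after this patch (the kwarg values are just examples):

    from zoobot.pytorch.estimators import define_model

    encoder = define_model.get_pytorch_encoder(
        architecture_name='efficientnet_b0',
        channels=3,
        # everything below is forwarded unchanged to timm.create_model
        pretrained=True,
        drop_path_rate=0.2,
    )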
@@ -297,7 +296,7 @@ def get_encoder_dim(encoder, input_size, channels): def get_pytorch_encoder( architecture_name='efficientnet_b0', channels=1, - use_imagenet_weights=False, + # use_imagenet_weights=False, **timm_kwargs ) -> nn.Module: """ @@ -333,7 +332,7 @@ def get_pytorch_encoder( if architecture_name == 'efficientnet': logging.warning('efficientnet variant not specified - please set architecture_name=efficientnet_b0 (or similar)') architecture_name = 'efficientnet_b0' - return timm.create_model(architecture_name, in_chans=channels, num_classes=0, pretrained=use_imagenet_weights, **timm_kwargs) + return timm.create_model(architecture_name, in_chans=channels, num_classes=0, **timm_kwargs) def get_pytorch_dirichlet_head(encoder_dim: int, output_dim: int, test_time_dropout: bool, dropout_rate: float) -> torch.nn.Sequential: From bc667922364a4e3a067e0309a41e345a587b9db6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 12:07:05 -0500 Subject: [PATCH 204/307] 4 gpu for speed, if it starts --- only_for_me/narval/train.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 262db057..c882d1be 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --time=03:30:0 #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:1 +#SBATCH --gres=gpu:v100:4 nvidia-smi @@ -26,11 +26,11 @@ REPO_DIR=/project/def-bovy/walml/zoobot # --color --wandb --mixed-precision srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_tiny_rw_224_1gpu \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_tiny_rw_224_4gpu_terr \ --batch-size 64 \ --architecture maxvit_tiny_rw_224 \ --terrestrial \ - --gpus 1 \ + --gpus 4 \ --num-workers 10 \ --color --wandb --mixed-precision From 027a40bacabe7a4709b71db60860eebfa8db0124 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 12:17:50 -0500 Subject: [PATCH 205/307] effnet terrestrial --- only_for_me/narval/train.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index c882d1be..3912a172 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -26,15 +26,13 @@ REPO_DIR=/project/def-bovy/walml/zoobot # --color --wandb --mixed-precision srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_tiny_rw_224_4gpu_terr \ - --batch-size 64 \ - --architecture maxvit_tiny_rw_224 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b0_4gpu_terr \ + --batch-size 256 \ + --architecture efficientnet_b0 \ --terrestrial \ --gpus 4 \ --num-workers 10 \ - --color --wandb --mixed-precision - - # \ --compile-encoder + --color --wandb --mixed-precision -compile-encoder # batch sizes From d5fdb6f8be0aebdc975868cc818a2303b53062d6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 12:50:52 -0500 Subject: [PATCH 206/307] dash missing --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 3912a172..505d403d 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -32,7 +32,7 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ --terrestrial \ --gpus 4 
\ --num-workers 10 \ - --color --wandb --mixed-precision -compile-encoder + --color --wandb --mixed-precision --compile-encoder # batch sizes From ac47a5ab502928cd2888a68192a6496734a9e9e5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 12:51:12 -0500 Subject: [PATCH 207/307] little longer --- only_for_me/narval/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 505d403d..f5e3af98 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --time=03:30:0 +#SBATCH --time=06:30:0 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 From ab89eb9fa0d85c99c4c7e0579464f2707e5aa8be Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 12:59:21 -0500 Subject: [PATCH 208/307] maxvit overfit faster last time at higher batch size can I add a little weight decay and see if it helps --- only_for_me/narval/train.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index f5e3af98..224e91db 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -26,12 +26,13 @@ REPO_DIR=/project/def-bovy/walml/zoobot # --color --wandb --mixed-precision srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_efficientnet_b0_4gpu_terr \ - --batch-size 256 \ - --architecture efficientnet_b0 \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_tiny_rw_224_4gpu_terr_w5 \ + --batch-size 64 \ + --architecture maxvit_tiny_rw_224 \ --terrestrial \ --gpus 4 \ --num-workers 10 \ + --weight-decay 0.05 \ --color --wandb --mixed-precision --compile-encoder From 31b146ff395bae8774917bb23998f5db4e60b680 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 30 Nov 2023 20:58:28 -0500 Subject: [PATCH 209/307] try base from terrestrial init --- only_for_me/narval/train.sh | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh index 224e91db..dc027b44 100644 --- a/only_for_me/narval/train.sh +++ b/only_for_me/narval/train.sh @@ -1,6 +1,6 @@ #!/bin/bash -#SBATCH --time=06:30:0 -#SBATCH --nodes=1 +#SBATCH --time=23:30:0 +#SBATCH --nodes=4 #SBATCH --ntasks-per-node=4 #SBATCH --cpus-per-task=10 #SBATCH --mem-per-cpu 4G @@ -57,16 +57,15 @@ srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ # efficientnet_b5 - 64. remember it expects bigger images tho, may not work great # maxvit_rmlp_base_rw_224 - 32 (95%). 
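For the overfitting note above: under DDP the --batch-size argument is per GPU process, so these runs change the effective batch size as well as the weight decay. Back-of-envelope (my arithmetic, not from the diff):

    per_gpu_batch = 64   # --batch-size
    gpus_per_node = 4    # --gres=gpu:v100:4
    nodes = 1            # --nodes
    effective_batch = per_gpu_batch * gpus_per_node * nodes  # 256 galaxies per optimiser step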
Now scaling at 16 gpus -# srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ -# --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_w005 \ -# --batch-size 32 \ -# --gpus 4 \ -# --nodes 1 \ -# --num-workers 5 \ -# --weight-decay 0.05 \ -# --architecture maxvit_rmlp_base_rw_224 \ -# --color --wandb --mixed-precision - - # --compile-encoder +srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ + --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_w005_terr \ + --batch-size 32 \ + --gpus 4 \ + --nodes 4 \ + --num-workers 5 \ + --weight-decay 0.05 \ + --architecture maxvit_rmlp_base_rw_224 \ + --color --wandb --mixed-precision \ + --compile-encoder --terrestrial # maxvit_small_tf_224 \ From bea4b424788d1043bf6aeb9f2cd5b0f5cb4719bd Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 8 Dec 2023 15:35:19 -0500 Subject: [PATCH 210/307] refactor narval changes out of zoobot --- only_for_me/narval/env_test.sh | 19 --- only_for_me/narval/finetune.py | 72 -------- only_for_me/narval/finetune.sh | 60 ------- only_for_me/narval/gpu_split.py | 132 --------------- only_for_me/narval/gpu_split.sh | 30 ---- only_for_me/narval/make_webdataset_script.py | 160 ------------------ only_for_me/narval/narval.md | 73 --------- only_for_me/narval/pytorch-ddp-test-pl.py | 78 --------- only_for_me/narval/pytorch-ddp-test-pl.sh | 20 --- only_for_me/narval/pytorch_test.py | 23 --- only_for_me/narval/requirements.txt | 32 ---- only_for_me/narval/train.py | 163 ------------------- only_for_me/narval/train.sh | 71 -------- zoobot/pytorch/datasets/webdataset_utils.py | 20 +++ 14 files changed, 20 insertions(+), 933 deletions(-) delete mode 100644 only_for_me/narval/env_test.sh delete mode 100644 only_for_me/narval/finetune.py delete mode 100644 only_for_me/narval/finetune.sh delete mode 100644 only_for_me/narval/gpu_split.py delete mode 100644 only_for_me/narval/gpu_split.sh delete mode 100644 only_for_me/narval/make_webdataset_script.py delete mode 100644 only_for_me/narval/narval.md delete mode 100644 only_for_me/narval/pytorch-ddp-test-pl.py delete mode 100644 only_for_me/narval/pytorch-ddp-test-pl.sh delete mode 100644 only_for_me/narval/pytorch_test.py delete mode 100644 only_for_me/narval/requirements.txt delete mode 100644 only_for_me/narval/train.py delete mode 100644 only_for_me/narval/train.sh diff --git a/only_for_me/narval/env_test.sh b/only_for_me/narval/env_test.sh deleted file mode 100644 index b472fae8..00000000 --- a/only_for_me/narval/env_test.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -#SBATCH --mem=16G -#SBATCH --nodes=1 -#SBATCH --time=0:5:0 -#SBATCH --ntasks-per-node=4 - -echo "$now" - -module load StdEnv/2020 -module load python/3.9.6 -virtualenv --no-download $SLURM_TMPDIR/env -source $SLURM_TMPDIR/env/bin/activate -pip install --no-index -r /project/def-bovy/walml/zoobot/only_for_me/narval/requirements.txt -cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ -cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ -pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets -pip install --no-deps -e $SLURM_TMPDIR/zoobot - -echo "$now" diff --git a/only_for_me/narval/finetune.py b/only_for_me/narval/finetune.py deleted file mode 100644 index cde0e6ca..00000000 --- a/only_for_me/narval/finetune.py +++ /dev/null @@ -1,72 +0,0 @@ -import logging -import os -import shutil - -from pytorch_lightning.loggers import WandbLogger - -from zoobot.pytorch.training import finetune -from galaxy_datasets import galaxy_mnist -from 
galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule - - -if __name__ == '__main__': - - logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO) - logging.info('Begin') - - logging.info(os.environ['SLURM_TMPDIR']) - - # os.environ['NCCL_BLOCKING_WAIT'] = 1 - - # import glob - # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data'))) - # logging.info(glob.glob(os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'))) - - import torch - torch.set_float32_matmul_precision('medium') - assert torch.cuda.is_available() - - batch_size = 512 - num_workers = 10 - n_blocks = 3 # EffnetB0 is divided into 7 blocks. set 0 to only fit the head weights. Set 1, 2, etc to finetune deeper. - max_epochs = 60 # 6 epochs should get you ~93% accuracy. Set much higher (e.g. 1000) for harder problems, to use Zoobot's default early stopping. \ - - train_catalog, _ = galaxy_mnist(root=os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'), download=False, train=True) - test_catalog, _ = galaxy_mnist(root=os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/data/galaxy_mnist'), download=False, train=False) - logging.info('Data ready') - - label_cols = ['label'] - num_classes = 4 - - # load a pretrained checkpoint saved here - # rsync -avz --no-g --no-p /home/walml/repos/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt walml@narval.alliancecan.ca:/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch - checkpoint_loc = '/project/def-bovy/walml/zoobot/data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt' - - logger = WandbLogger(name='debug', save_dir='/project/def-bovy/walml/wandb/debug', project='narval', log_model=False, offline=True) - # logger = None - - datamodule = GalaxyDataModule( - label_cols=label_cols, - catalog=train_catalog, # very small, as a demo - batch_size=batch_size, # increase for faster training, decrease to avoid out-of-memory errors - num_workers=num_workers # TODO set to a little less than num. 
CPUs - ) - datamodule.setup() - model = finetune.FinetuneableZoobotClassifier( - checkpoint_loc=checkpoint_loc, - num_classes=num_classes, - n_blocks=n_blocks - ) - trainer = finetune.get_trainer( - os.path.join(os.environ['SLURM_TMPDIR'], 'walml/finetune/checkpoints'), - accelerator='gpu', - devices=2, - num_nodes=2, - strategy='ddp', - precision='16-mixed', - max_epochs=max_epochs, - enable_progress_bar=False, - logger=logger - ) - trainer.fit(model, datamodule) - # trainer.test(model, datamodule) diff --git a/only_for_me/narval/finetune.sh b/only_for_me/narval/finetune.sh deleted file mode 100644 index 665cfba2..00000000 --- a/only_for_me/narval/finetune.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -#SBATCH --mem=32G -#SBATCH --nodes=2 -#SBATCH --time=0:20:0 -#SBATCH --tasks-per-node=2 -#SBATCH --cpus-per-task=12 -#SBATCH --gres=gpu:a100:2 - -# https://github.com/webdataset/webdataset-lightning/blob/main/simple_cluster.py -#### SBATCH --mem=32G -#### SBATCH --nodes=1 -#### SBATCH --time=0:20:0 -#### SBATCH --tasks-per-node=2 -#### SBATCH --cpus-per-task=12 -#### SBATCH --gres=gpu:a100:2 - -#### -#### SBATCH --mem=16G -#### SBATCH --nodes=1 -#### SBATCH --time=0:10:0 -#### SBATCH --ntasks-per-node=8 -#### SBATCH --gres=gpu:a100:1 - -#### SBATCH --mail-user= -#### SBATCH --mail-type=ALL - -# module load StdEnv/2020 # CUDA etc -nvidia-smi - -PYTHON=/home/walml/envs/zoobot39_dev/bin/python - -# module load python/3.9.6 -# virtualenv --no-download $SLURM_TMPDIR/env -# source $SLURM_TMPDIR/env/bin/activate -# pip install --no-index -r /project/def-bovy/walml/zoobot/only_for_me/narval/requirements.txt -# cp -r /project/def-bovy/walml/galaxy-datasets $SLURM_TMPDIR/ -# cp -r /project/def-bovy/walml/zoobot $SLURM_TMPDIR/ -# pip install --no-deps -e $SLURM_TMPDIR/galaxy-datasets -# pip install --no-deps -e $SLURM_TMPDIR/zoobot - -mkdir $SLURM_TMPDIR/walml -mkdir $SLURM_TMPDIR/walml/finetune -mkdir $SLURM_TMPDIR/walml/finetune/data -mkdir $SLURM_TMPDIR/walml/finetune/checkpoints - -cp -r /project/def-bovy/walml/data/roots/galaxy_mnist $SLURM_TMPDIR/walml/finetune/data/ - -ls $SLURM_TMPDIR/walml/finetune/data/galaxy_mnist - -# wandb offline # only write metadata locally - -export NCCL_BLOCKING_WAIT=1 #Set this environment variable if you wish to use the NCCL backend for inter-GPU communication. -# export MASTER_ADDR=$(hostname) #Store the master node’s IP address in the MASTER_ADDR environment variable. 
-# echo "r$SLURM_NODEID master: $MASTER_ADDR" -# echo "r$SLURM_NODEID Launching python script" - -srun $PYTHON /project/def-bovy/walml/zoobot/only_for_me/narval/finetune.py -# srun python $SLURM_TMPDIR/zoobot/only_for_me/narval/finetune.py - -ls $SLURM_TMPDIR/walml/finetune/checkpoints diff --git a/only_for_me/narval/gpu_split.py b/only_for_me/narval/gpu_split.py deleted file mode 100644 index ff0ab328..00000000 --- a/only_for_me/narval/gpu_split.py +++ /dev/null @@ -1,132 +0,0 @@ -# import datetime -import logging -import os -import time -import glob - -import torch -import torch.nn.functional as F - -import pytorch_lightning as pl - -import argparse - -from zoobot.pytorch.datasets import webdataset_utils -from zoobot.shared import schemas -from zoobot.pytorch import datasets - -from torch import nn - -class ToyLightningModule(pl.LightningModule): - - def __init__(self): - super(ToyLightningModule, self).__init__() - - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - # pool again - # shape (B, F, H, W) - - def forward(self, x): - x = self.pool(nn.functional.relu(self.conv1(x))) - x = self.pool(nn.functional.relu(self.conv2(x))) - time.sleep(1) - return torch.mean(x, dim=(1, 2, 3)) # shape (B) - - - def training_step(self, batch, batch_idx): - images, labels = batch - y_hat = self(images) # mean after some convs - y = labels[:, 0].float() / 20. # first random number, divided by a big number to now be below 0 - loss = F.cross_entropy(y_hat, y) - return loss # meaningless but mathematically works - - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=1e-3) - - -def main(): - - logging.basicConfig(level=logging.INFO) - logging.warning('Script start') - - parser = argparse.ArgumentParser() - parser.add_argument('--save-dir', dest='save_dir', type=str) - parser.add_argument('--batch-size', dest='batch_size', default=16, type=int) - parser.add_argument('--gpus', dest='gpus', default=1, type=int) - parser.add_argument('--nodes', dest='nodes', default=1, type=int) - parser.add_argument('--mixed-precision', dest='mixed_precision', - default=False, action='store_true') - parser.add_argument('--debug', dest='debug', - default=False, action='store_true') - parser.add_argument('--wandb', dest='wandb', - default=False, action='store_true') - parser.add_argument('--seed', dest='random_state', default=1, type=int) - args = parser.parse_args() - - # if os.path.isdir('/home/walml/repos/zoobot'): - save_dir = '/home/walml/repos/temp' - - # else: - # save_dir = os.environ['SLURM_TMPDIR'] - - schema = schemas.decals_all_campaigns_ortho_schema - - shards = webdataset_utils.make_mock_wds(save_dir, schema.label_cols, n_shards=10, shard_size=256) - # exit() - # webdataset_utils.load_wds_directly(shards[0], max_to_load=None) - # webdataset_utils.load_wds_with_webdatamodule(shards, label_cols=schema.label_cols, max_to_load=None) - shards = list(glob.glob('/home/walml/repos/temp/mock_shard_*_256.tar')) - # exit() - - train_shards = shards[:8] - val_shards = shards[8:] # not actually used - - datamodule = datasets.webdatamodule.WebDataModule( - train_urls=train_shards, - val_urls=val_shards, - batch_size=args.batch_size, - num_workers=1, - label_cols=schema.label_cols, - cache_dir=None - # TODO pass through the rest - ) - # use_distributed_sampler=False - - trainer = pl.Trainer( - # log_every_n_steps=16, # at batch 512 (A100 MP max), DR5 has ~161 train steps - accelerator='gpu', - devices=args.gpus, # per node - num_nodes=args.nodes, - # 
strategy='auto', - precision='32', - logger=False, - # callbacks=callbacks, - max_epochs=10, - default_root_dir=save_dir, - # plugins=plugins, - # use_distributed_sampler=use_distributed_sampler - ) - - # lightning_model = ToyLightningModule() - # lightning_model = torch.compile(lightning_model) - - from zoobot.pytorch.estimators import define_model - lightning_model = define_model.ZoobotTree( - output_dim=len(schema.label_cols), - question_index_groups=schema.question_index_groups, - architecture_name="efficientnet_b0", - channels=3, - compile_encoder=False # with compile on local desktop, 51 seconds for first epoch and 10 seconds thereafter. Without, 12 seconds for all epochs. - ) - # lightning_model = torch.compile(lightning_model) - - trainer.fit(lightning_model, datamodule) # uses batch size of datamodule - - # batch size 16 - # shard size 16, 10 shards with 8 being assigned as training shards so 8*32 train images, 8*2=16 train batches - - -if __name__=='__main__': - main() diff --git a/only_for_me/narval/gpu_split.sh b/only_for_me/narval/gpu_split.sh deleted file mode 100644 index ab99a982..00000000 --- a/only_for_me/narval/gpu_split.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -#SBATCH --time=0:10:0 -#SBATCH --nodes=2 # This needs to match Trainer(num_nodes=...) -#SBATCH --ntasks-per-node=4 # This needs to match Trainer(devices=...). This is PER TASK. -# srun is slurm's way to start many jobs from the same sbatch script -# the sbatch script runs *once* and then the srun command is called ntasks-per-node times on each node -# Lightning knows via env variables that it is running on slurm and identifies which DDP instance it should spin up -# webdatasets then reads from lighting with LOCAL_RANK worker we're on and loads the appropriate data -#SBATCH --cpus-per-task=4 -#SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 # This needs to match Trainer(devices=...). This is PER TASK. Total GPU = nodes*devices - -# https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_1.html#setup-the-training-script -# https://lightning.ai/docs/pytorch/stable/clouds/cluster_advanced.html - - - -nvidia-smi - -PYTHON=/home/walml/envs/zoobot39_dev/bin/python - -# mkdir $SLURM_TMPDIR/cache -# mkdir /tmp/cache - -export NCCL_BLOCKING_WAIT=1 # "Set this environment variable if you wish to use the NCCL backend for inter-GPU communication." 
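The sbatch comments being deleted above are still the key constraint: --ntasks-per-node and --gres must match Trainer(devices=...), and --nodes must match Trainer(num_nodes=...), or Lightning's SLURM detection mis-counts ranks. One way to keep them in sync is to read the scheduler's variables rather than hard-coding (a sketch; assumes SLURM exports SLURM_GPUS_ON_NODE and SLURM_JOB_NUM_NODES on this cluster):

    import os
    import pytorch_lightning as pl

    devices = int(os.environ.get('SLURM_GPUS_ON_NODE', 1))     # from --gres=gpu:...:N
    num_nodes = int(os.environ.get('SLURM_JOB_NUM_NODES', 1))  # from --nodes
    trainer = pl.Trainer(accelerator='gpu', devices=devices, num_nodes=num_nodes, strategy='ddp')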
-# instructed by Compute Canada, not lightning - -echo 'Running script' -REPO_DIR=/project/def-bovy/walml/zoobot -srun $PYTHON $REPO_DIR/only_for_me/narval/gpu_split.py --gpus 4 --nodes 2 diff --git a/only_for_me/narval/make_webdataset_script.py b/only_for_me/narval/make_webdataset_script.py deleted file mode 100644 index b9c4be00..00000000 --- a/only_for_me/narval/make_webdataset_script.py +++ /dev/null @@ -1,160 +0,0 @@ -import logging - -import numpy as np -import pandas as pd -from astropy import units as u -from astropy.coordinates import SkyCoord - -from PIL import Image # necessary to avoid PIL.Image error assumption in web_datasets - -from galaxy_datasets.shared import label_metadata -from galaxy_datasets import gz2 - -from sklearn.model_selection import train_test_split - -from zoobot.pytorch.datasets import webdataset_utils - - -def dataset_to_webdataset(dataset_name, dataset_func, label_cols, divisor=4096): - - train_catalog, _ = dataset_func(root=f'/home/walml/data/galaxy-datasets/{dataset_name}', download=True, train=True) - test_catalog, _ = dataset_func(root=f'/home/walml/data/galaxy-datasets/{dataset_name}', download=False, train=False) - - catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=divisor) - - -def catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=4096): - for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: - n_shards = len(catalog) // divisor - logging.info(n_shards) - - catalog = catalog[:n_shards*divisor] - logging.info(len(catalog)) - - save_loc = f"/home/walml/data/wds/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically - - webdataset_utils.df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=False) - - # webdataset_utils.load_wds_directly(save_loc) - - # webdataset_utils.load_wds_with_augmentation(save_loc) - - # webdataset_utils.load_wds_with_webdatamodule([save_loc], label_cols) - - -def main(): - - # for converting my galaxy-dataset datasets - # dataset_name = 'gz2' - # dataset_func = gz2 - # label_cols = label_metadata.gz2_ortho_label_cols - # dataset_to_webdataset(dataset_name, label_cols, dataset_func) - - - - # for converting other catalogs e.g. DESI - # dataset_name = 'desi_labelled_300px_2048' - dataset_name = 'desi_all_300px_2048' - label_cols = label_metadata.decals_all_campaigns_ortho_label_cols - columns = [ - 'dr8_id', 'brickid', 'objid', 'ra', 'dec' - ] - # df = pd.read_parquet('/home/walml/repos/decals-rings/data/master_all_file_index_passes_file_checks.parquet', columns=columns) - # desi pipeline shreds sources. Be careful to deduplicate. 
- - columns = ['id_str'] + label_cols - votes = pd.concat([ - pd.read_parquet(f'/media/walml/beta/galaxy_zoo/decals/dr8/catalogs/training_catalogs/{campaign}_ortho_v5_labelled_catalog.parquet', columns=columns) - for campaign in ['dr12', 'dr5', 'dr8'] - ], axis=0) - assert votes['id_str'].value_counts().max() == 1, votes['id_str'].value_counts() - votes['dr8_id'] = votes['id_str'] - - # name = 'labelled' - # merge_strategy = {'labelled': 'inner', 'all': 'left'} - # df = pd.merge(df, votes[['dr8_id']], on='dr8_id', how=merge_strategy[name]) - - # df['relative_file_loc'] = df.apply(lambda x: f"{x['brickid']}/{x['brickid']}_{x['objid']}.jpg", axis=1) - # df['file_loc'] = '/home/walml/data/desi/jpg/' + df['relative_file_loc'] - - # df_dedup = remove_close_sky_matches(df) - # print(len(df_dedup)) - # df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') - # exit() - # df_dedup2 = remove_close_sky_matches(df_dedup) - # print(len(df_dedup2)) - # df_dedup.to_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') - - # df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_labelled_dedup_20arcsec.parquet') - df_dedup = pd.read_parquet('/home/walml/data/desi/master_all_file_index_all_dedup_20arcsec.parquet') - df_dedup['id_str'] = df_dedup['dr8_id'] - - # columns = ['id_str', 'smooth-or-featured-dr12_total-votes', 'smooth-or-featured-dr5_total-votes', 'smooth-or-featured-dr8_total-votes'] - - # gets too big, need to only merge in label_df per shard - # df_dedup_with_votes = pd.merge(df_dedup, votes, how='left', on='dr8_id') - - train_catalog, test_catalog = train_test_split(df_dedup, test_size=0.2, random_state=42) - train_catalog.to_parquet(f'/home/walml/data/wds/{dataset_name}/train_catalog_v1.parquet', index=False) - test_catalog.to_parquet(f'/home/walml/data/wds/{dataset_name}/test_catalog_v1.parquet', index=False) - - catalogs_to_webdataset(dataset_name, label_cols, train_catalog, test_catalog, divisor=2048, sparse_label_df=votes) - - - - - -def remove_close_sky_matches(df, seplimit=20*u.arcsec, col_to_prioritise='ra'): - - catalog = SkyCoord(ra=df['ra'].values * u.deg, dec=df['dec'].values * u.deg) - - search_coords = catalog - - logging.info('Beginning search for nearby galaxies') - idxc, idxcatalog, d2d, _ = catalog.search_around_sky(search_coords, seplimit=seplimit) - # idxc is index in search coords - # idxcatalog is index in catalog - # steps through all indexes in both that are within seplimit - # d2d gives the distance (not used here) - - # includes self-match, so remove these - idxc = idxc[d2d > 0] - idxcatalog = idxcatalog[d2d > 0] - d2d = d2d[d2d > 0] - - logging.info('Beginning drop prioritisation') - indices_to_drop = [] - for search_index_val in pd.unique(idxc): - matched_indices = idxcatalog[idxc == search_index_val] - matched_indices_including_self = matched_indices.tolist() + [search_index_val] - - # use RA as tiebreaker - matching_galaxies = df.iloc[matched_indices_including_self] - highest = matching_galaxies.index[np.argmax(matching_galaxies[col_to_prioritise])] - these_indices_to_drop = list(set(matched_indices_including_self) - set([highest])) - indices_to_drop += these_indices_to_drop - - indices_to_drop = set(indices_to_drop) - all_indices = np.arange(len(df)) # index is like this, for sure - indices_to_keep = set(all_indices) - indices_to_drop - df_dedup = df.iloc[list(indices_to_keep)] - return df_dedup - - - - - -if __name__ == '__main__': - - 
logging.basicConfig(level=logging.INFO) - - main() - - - - - - # df = df[:100000] - # df['total_votes'] = df['smooth-or-featured-dr12_total-votes'] + df['smooth-or-featured-dr5_total-votes'] + df['smooth-or-featured-dr8_total-votes'] - # df['total_votes'] = df['total_votes'].fillna(0) - # df['random'] = np.random.rand(len(df)) \ No newline at end of file diff --git a/only_for_me/narval/narval.md b/only_for_me/narval/narval.md deleted file mode 100644 index 6203868c..00000000 --- a/only_for_me/narval/narval.md +++ /dev/null @@ -1,73 +0,0 @@ - -Sources - -https://docs.alliancecan.ca/wiki/Narval/en -https://docs.alliancecan.ca/wiki/AI_and_Machine_Learning -https://docs.alliancecan.ca/wiki/PyTorch -https://prashp.gitlab.io/post/compute-canada-tut/ -https://docs.alliancecan.ca/wiki/Python - -ssh walml@narval.alliancecan.ca -ssh-copy-id to avoid password in future - - module purge - module avail - -Just for venv: - module load python/3.9.6 - - mkdir ~/envs - virtualenv --no-download ~/envs/zoobot39_dev - source ~/envs/zoobot39_dev/bin/activate - - avail_wheels "torch*" - -Latest is currently 2.0.1 (no 2.1.0 yet) - - pip install --no-index torch==2.0.1 torchvision torchtext torchaudio - pip install --no-index pytorch_lightning wandb - -Storage under /home/user is not ideal, 50gb space. Use /project/def-bovy/walml (1TB space). -Can transfer data via rsync login node. - -Move ssh key for easy login (run on LOCAL desktop) - - ssh-copy-id walml@narval.alliancecan.ca - -Make new pub id key for github (back on cluster) - - ssh-keygen -t rsa -b 4096 - cat ~/.ssh/id_rsa.pub -and add to [Github](https://github.com/settings/keys) as normal - -Set up repos - - - cd /project/def-bovy/walml - -(I made a .bashrc alias, export PROJECT=/project/def-bovy/walml) - -git checkout narval-migration (from zoobot) -pip install --no-index -r zoobot/only_for_me/narval/requirements.txt - -and my own cloned repos -pip install --no-deps -e galaxy-datasets -pip install --no-deps -e zoobot - -Run training - -sbatch only_for_me/narval/finetune.sh - -Works with simple images on multi-GPU, single node - -rsync -avz --no-g --no-p /home/walml/repos/zoobot/only_for_me/narval/gz_decals_5/*.tar walml@narval.computecanada.ca:projects/def-bovy/walml/data/webdatasets/gz_decals_5/full - - - -https://lightning.ai/docs/pytorch/stable/clouds/cluster_intermediate_2.html# -https://pytorch.org/docs/stable/elastic/run.html#environment-variables -https://github.com/webdataset/webdataset/issues/250 -https://github.com/webdataset/webdataset-lightning/blob/main/train.py -https://lightning.ai/forums/t/multi-gpu-multi-node-training-with-webdataset/2300 -https://webdataset.github.io/webdataset/multinode/ -https://webdataset.github.io/webdataset/creating/ \ No newline at end of file diff --git a/only_for_me/narval/pytorch-ddp-test-pl.py b/only_for_me/narval/pytorch-ddp-test-pl.py deleted file mode 100644 index eb35ed85..00000000 --- a/only_for_me/narval/pytorch-ddp-test-pl.py +++ /dev/null @@ -1,78 +0,0 @@ -# import datetime - -import torch -from torch import nn -import torch.nn.functional as F - -import pytorch_lightning as pl - -# import torchvision -import torchvision.transforms as transforms -from torchvision.datasets import CIFAR10 -from torch.utils.data import DataLoader - -import argparse - -parser = argparse.ArgumentParser(description='cifar10 classification models, pytorch-lightning parallel test') -parser.add_argument('--lr', default=0.1, help='') -parser.add_argument('--max_epochs', type=int, default=4, help='') 
-parser.add_argument('--batch_size', type=int, default=768, help='') -parser.add_argument('--num_workers', type=int, default=0, help='') - - -def main(): - print("Starting...") - - args = parser.parse_args() - - class Net(pl.LightningModule): - - def __init__(self): - super(Net, self).__init__() - - self.conv1 = nn.Conv2d(3, 6, 5) - self.pool = nn.MaxPool2d(2, 2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = self.pool(F.relu(self.conv1(x))) - x = self.pool(F.relu(self.conv2(x))) - x = x.view(-1, 16 * 5 * 5) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - def training_step(self, batch, batch_idx): - x, y = batch - y_hat = self(x) - loss = F.cross_entropy(y_hat, y) - return loss - - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=args.lr) - - net = Net() - - """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs per node. - To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs - and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes. - We also set progress_bar_refresh_rate=0 to avoid writing a progress bar to the logs, - which can cause issues due to updating logs too frequently.""" - - trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy='ddp', max_epochs = args.max_epochs, enable_progress_bar=False) - - transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) - - dataset_train = CIFAR10(root='/project/def-bovy/walml/data/roots/cifar10', train=True, download=False, transform=transform_train) - - train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) - - trainer.fit(net,train_loader) - - -if __name__=='__main__': - main() \ No newline at end of file diff --git a/only_for_me/narval/pytorch-ddp-test-pl.sh b/only_for_me/narval/pytorch-ddp-test-pl.sh deleted file mode 100644 index 032731cb..00000000 --- a/only_for_me/narval/pytorch-ddp-test-pl.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -#SBATCH --nodes 1 -#SBATCH --gres=gpu:2 # Request 2 GPU "generic resources”. -#SBATCH --tasks-per-node=2 # Request 1 process per GPU. You will get 1 CPU per process by default. Request more CPUs with the "cpus-per-task" parameter to enable multiple data-loader workers to load data in parallel. -#SBATCH --mem=8G -#SBATCH --time=0-00:15 -#SBATCH --output=%N-%j.out - -module load python # Using Default Python version - Make sure to choose a version that suits your application -virtualenv --no-download $SLURM_TMPDIR/env -source $SLURM_TMPDIR/env/bin/activate -pip install torchvision pytorch-lightning --no-index - -export NCCL_BLOCKING_WAIT=1 #Pytorch Lightning uses the NCCL backend for inter-GPU communication by default. Set this variable to avoid timeout errors. - -# PyTorch Lightning will query the environment to figure out if it is running inside a SLURM batch job -# If it is, it expects the user to have requested one task per GPU. -# If you do not ask for 1 task per GPU, and you do not run your script with "srun", your job will fail! 
- -srun python /project/def-bovy/walml/zoobot/only_for_me/narval/pytorch-ddp-test-pl.py --batch_size 256 diff --git a/only_for_me/narval/pytorch_test.py b/only_for_me/narval/pytorch_test.py deleted file mode 100644 index 9f156b8d..00000000 --- a/only_for_me/narval/pytorch_test.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -x = torch.Tensor(5, 3) -print(x) -y = torch.rand(5, 3) -print(y) -# let us run the following only if CUDA is available -if torch.cuda.is_available(): - x = x.cuda() - y = y.cuda() - print(x + y) -else: - raise AssertionError('CUDA not available') - -# TODO DemoRings from galaxy-datasets - -from galaxy_datasets import galaxy_mnist - -root = '/project/def-bovy/walml/data/roots/galaxy_mnist' - -df, label_cols = galaxy_mnist(root, train=True, download=False) # must be already downloaded, no internet - - -# TODO import zoobot and use something diff --git a/only_for_me/narval/requirements.txt b/only_for_me/narval/requirements.txt deleted file mode 100644 index 0daeb251..00000000 --- a/only_for_me/narval/requirements.txt +++ /dev/null @@ -1,32 +0,0 @@ -torch -torchvision -torchaudio -pytorch-lightning -albumentations -pyro-ppl -torchmetrics -timm -wandb -h5py -astropy -pandas -matplotlib -fastparquet # replacing pyarrow below -webdataset -# below already required by packages above - -# tqdm -# pillow -# numpy -# pandas -# scipy -# scikit-image -# scikit-learn -# matplotlib - -# not prebuilt, build fails! -# pyarrow - -# possibly trouble ahead with these two -# -# diff --git a/only_for_me/narval/train.py b/only_for_me/narval/train.py deleted file mode 100644 index 663b2880..00000000 --- a/only_for_me/narval/train.py +++ /dev/null @@ -1,163 +0,0 @@ -import logging -import os -import argparse -import glob - -from pytorch_lightning.loggers import WandbLogger -import wandb - -from zoobot.pytorch.training import train_with_pytorch_lightning -from zoobot.shared import benchmark_datasets, schemas - -import pytorch_lightning as pl - - -if __name__ == '__main__': - - """ - Used to create the PyTorch pretrained weights checkpoints - See .sh file of the same name for args used. 
- - See zoobot/pytorch/examples/minimal_examples.py for a friendlier example - """ - parser = argparse.ArgumentParser() - parser.add_argument('--save-dir', dest='save_dir', type=str, default='local_debug') - # parser.add_argument('--data-dir', dest='data_dir', type=str) - # parser.add_argument('--dataset', dest='dataset', type=str, help='dataset to use, either "gz_decals_dr5" or "gz_evo"') - parser.add_argument('--architecture', dest='architecture_name', default='efficientnet_b0', type=str) - parser.add_argument('--accumulate-gradients', dest='acculumate_gradients', default=1, type=int) - parser.add_argument('--terrestrial-init', dest='terrestrial', default=False, action='store_true') - parser.add_argument('--resize-after-crop', dest='resize_after_crop', - type=int, default=224) - parser.add_argument('--color', default=False, action='store_true') - parser.add_argument('--compile-encoder', dest='compile_encoder', default=False, action='store_true') - parser.add_argument('--batch-size', dest='batch_size', - default=16, type=int) - parser.add_argument('--num-features', dest='num_features', - default=1280, type=int) - parser.add_argument('--gpus', dest='gpus', default=1, type=int) - parser.add_argument('--nodes', dest='nodes', default=1, type=int) - parser.add_argument('--num-workers', dest='num_workers', default=1, type=int) - parser.add_argument('--mixed-precision', dest='mixed_precision', - default=False, action='store_true') - parser.add_argument('--debug', dest='debug', - default=False, action='store_true') - parser.add_argument('--wandb', dest='wandb', - default=False, action='store_true') - parser.add_argument('--weight-decay', dest='weight_decay', - default=0.01, type=float) - parser.add_argument('--learning-rate', dest='learning_rate', - default=1e-3, type=float) - parser.add_argument('--seed', dest='random_state', default=1, type=int) - args = parser.parse_args() - - """ - debug - python only_for_me/narval/train.py --save-dir only_for_me/narval/debug_models --batch-size 32 --color - """ - - logging.basicConfig(level=logging.INFO) - - random_state = args.random_state - pl.seed_everything(random_state) - - # if args.nodes > 1: - # # at Manchester, our slurm cluster sets TASKS not NTASKS, which then confuses lightning - # if 'SLURM_NTASKS_PER_NODE' not in os.environ.keys(): - # os.environ['SLURM_NTASKS_PER_NODE'] = os.environ['SLURM_TASKS_PER_NODE'] - # # log the rest to help debug - # logging.info([(x, y) for (x, y) in os.environ.items() if 'SLURM' in x]) - - if os.path.isdir('/home/walml/repos/zoobot'): - logging.warning('local mode') - search_str = '/home/walml/data/wds/desi_labelled_300px_2048/desi_labelled_300px_2048_train_*.tar' - cache_dir = None - - else: - search_str = '/home/walml/projects/def-bovy/walml/data/webdatasets/desi_labelled_300px_2048/desi_labelled_300px_2048_train_*.tar' - cache_dir = os.environ['SLURM_TMPDIR'] + '/cache' - if not os.path.isdir(cache_dir): - try: - os.mkdir(cache_dir) - except FileExistsError: - pass # race condition - - all_urls = glob.glob(search_str) - assert len(all_urls) > 0, search_str - # train_urls, val_urls = all_urls[:70], all_urls[70:] - # train_urls, val_urls = all_urls[:60], all_urls[60:70] - train_urls, val_urls = all_urls[:120], all_urls[120:140] # all num shards must be divisible by workers * gpus e.g. 
10*1, 10*2 - # train_urls, val_urls = all_urls[:112], all_urls[112:140] # divisible by 16 - train_urls = train_urls * 4 - val_urls = val_urls * 4 - import random - random.shuffle(train_urls) - random.shuffle(val_urls) - # 120 * 4 = 480. 480 / 5 / 16 = 8 :) - # 20 * 4 = 80. 80 / 5 / 16 = 1 :) - schema = schemas.decals_all_campaigns_ortho_schema - - # debug mode - if args.debug: - logging.warning( - 'Using debug mode: cutting urls down to 2') - train_urls = train_urls[:2] - val_urls = val_urls[:2] - epochs = 1 - else: - epochs = 1000 - - if args.wandb: - wandb_logger = WandbLogger( - project='narval', - # name=os.path.basename(args.save_dir), - log_model=False - ) - else: - wandb_logger = None - - timm_kwargs = {} - if 'efficientnet' in args.architecture_name: - timm_kwargs.update({'drop_path_rate': 0.2}) - if args.num_features != 1280: - timm_kwargs.update({'num_features': args.num_features}) - - if args.terrestrial: - timm_kwargs.update({'pretrained': True}) - - train_with_pytorch_lightning.train_default_zoobot_from_scratch( - save_dir=args.save_dir, - schema=schema, - train_urls = train_urls, - val_urls = val_urls, - test_urls = None, - architecture_name=args.architecture_name, - timm_kwargs=timm_kwargs, - batch_size=args.batch_size, - epochs=epochs, # rely on early stopping - patience=10, - # augmentation parameters - # color=args.color, - color=args.color, - resize_after_crop=args.resize_after_crop, - # hardware parameters - gpus=args.gpus, - nodes=args.nodes, - mixed_precision=args.mixed_precision, - wandb_logger=wandb_logger, - prefetch_factor=1, # TODO - num_workers=args.num_workers, - compile_encoder=args.compile_encoder, # NEW - random_state=random_state, - learning_rate=args.learning_rate, - weight_decay=args.weight_decay, - cache_dir=cache_dir, - crop_scale_bounds=(0.75, 0.85) # slightly increased to compensate for 424-400px crop when saving webdataset - # cache_dir='/tmp/cache' - # /tmp for ramdisk (400GB total, vs 4TB total for nvme) - ) - - # https://discuss.pytorch.org/t/torch-dynamo-hit-config-cache-size-limit-64/183886 - # https://pytorch.org/docs/stable/torch.compiler_faq.html#why-is-compilation-slow - - wandb.finish() \ No newline at end of file diff --git a/only_for_me/narval/train.sh b/only_for_me/narval/train.sh deleted file mode 100644 index dc027b44..00000000 --- a/only_for_me/narval/train.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -#SBATCH --time=23:30:0 -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=10 -#SBATCH --mem-per-cpu 4G -#SBATCH --gres=gpu:v100:4 - -nvidia-smi - -PYTHON=/home/walml/envs/zoobot39_dev/bin/python -# source ~/envs/zoobot39_dev/bin/activate - -mkdir $SLURM_TMPDIR/cache -# mkdir /tmp/cache - -export NCCL_BLOCKING_WAIT=1 - -REPO_DIR=/project/def-bovy/walml/zoobot -# srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ -# --save-dir $REPO_DIR/only_for_me/narval/desi_300px_f128_1gpu \ -# --batch-size 256 \ -# --num-features 128 \ -# --gpus 1 \ -# --num-workers 10 \ -# --color --wandb --mixed-precision - -srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_tiny_rw_224_4gpu_terr_w5 \ - --batch-size 64 \ - --architecture maxvit_tiny_rw_224 \ - --terrestrial \ - --gpus 4 \ - --num-workers 10 \ - --weight-decay 0.05 \ - --color --wandb --mixed-precision --compile-encoder - - -# batch sizes -# v100 -# efficientnet_b0 256 -# maxvit_tiny_rw_224 64 -# tf_efficientnetv2_b0 256 - 50.55%, might squeeze x2 -# tf_efficientnetv2_s 64? 
TODO -# pit_xs_224 512 -# pit_s_224 256 -# maxvit_small_224 32 -# vit_small_patch16_224 32? 17%, too small. TODO 128 (but pure vit is probably not great) -# vit_tiny_patch16_224 64? -# maxvit_rmlp_small_rw_224 64 (97% allocated and very good efficiency) -# https://huggingface.co/timm/convnextv2_nano.fcmae TODO with MAE -# convnext_nano -# convnext_tiny - 128 -# efficientnet_b2 - 32% at 64, can do 128 -# convnext_small 64 - 49.25%, MAYBE 128 -# efficientnet_b4 - 48% at 64, could maybe do 128 -# efficientnet_b5 - 64. remember it expects bigger images tho, may not work great -# maxvit_rmlp_base_rw_224 - 32 (95%). Now scaling at 16 gpus - -srun $PYTHON $REPO_DIR/only_for_me/narval/train.py \ - --save-dir $REPO_DIR/only_for_me/narval/desi_300px_maxvit_rmlp_base_rw_224_4gpu_w005_terr \ - --batch-size 32 \ - --gpus 4 \ - --nodes 4 \ - --num-workers 5 \ - --weight-decay 0.05 \ - --architecture maxvit_rmlp_base_rw_224 \ - --color --wandb --mixed-precision \ - --compile-encoder --terrestrial - - # maxvit_small_tf_224 \ diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index d4efe3b5..c54bfe4a 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -22,6 +22,25 @@ import zoobot.pytorch.datasets.webdatamodule as webdatamodule +def catalogs_to_webdataset(dataset_name, wds_dir, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=2048): + for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: + n_shards = len(catalog) // divisor + logging.info(n_shards) + + catalog = catalog[:n_shards*divisor] + logging.info(len(catalog)) + + # wds_dir e.g. /home/walml/data/wds + + save_loc = f"{wds_dir}/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically + + df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=False) + # some tests, if you like + # webdataset_utils.load_wds_directly(save_loc) + # webdataset_utils.load_wds_with_augmentation(save_loc) + # webdataset_utils.load_wds_with_webdatamodule([save_loc], label_cols) + + def make_mock_wds(save_dir: str, label_cols: List, n_shards: int, shard_size: int): counter = 0 shards = [os.path.join(save_dir, f'mock_shard_{shard_n}_{shard_size}.tar') for shard_n in range(n_shards)] @@ -97,6 +116,7 @@ def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse def galaxy_to_wds(galaxy: pd.Series, label_cols, transform=None): + assert os.path.isfile(galaxy['file_loc']), galaxy['file_loc'] im = cv2.imread(galaxy['file_loc']) # cv2 loads BGR for 'history', fix im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) From 10478f7a7f23beb357ca704370187a02cce89c14 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 19 Dec 2023 13:30:55 -0500 Subject: [PATCH 211/307] add per-q and per-campaign logging via schema metadata --- zoobot/pytorch/estimators/define_model.py | 58 +++++++++++++++++-- .../training/train_with_pytorch_lightning.py | 5 +- zoobot/shared/schemas.py | 2 +- 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index b4d784b2..ebb1450f 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -8,6 +8,7 @@ from torchmetrics import Accuracy import timm +from zoobot.shared import schemas from zoobot.pytorch.estimators import efficientnet_custom, custom_layers from zoobot.pytorch.training import 
losses @@ -142,7 +143,13 @@ class ZoobotTree(GenericLightningModule): def __init__( self, output_dim: int, - question_index_groups: List, + # in the simplest case, this is all zoobot needs: grouping of label col indices as questions + question_index_groups: List=None, + # BUT + # if you pass these, it enables better per-question and per-survey logging (because we have names) + # must be passed as simple dicts, not objects, so can't just pass schema in + question_answer_pairs: dict=None, + dependencies: dict=None, # encoder args architecture_name="efficientnet_b0", channels=1, @@ -162,8 +169,11 @@ def __init__( # now, finally, can pass only standard variables as hparams to save # will still need to actually use these variables later, this super init only saves them super().__init__( + # these all do nothing, they are simply saved by lightning as hparams output_dim, question_index_groups, + question_answer_pairs, + dependencies, architecture_name, channels, timm_kwargs, @@ -178,6 +188,14 @@ def __init__( logging.info('Generic __init__ complete - moving to Zoobot __init__') + if question_answer_pairs is not None: + logging.info('question_index_groups/dependencies passed to Zoobot, constructing schema in __init__') + assert question_index_groups is None, "Don't pass both question_index_groups and question_answer_pairs/dependencies" + assert dependencies is not None + self.schema = schemas.Schema(question_answer_pairs, dependencies) + # replace with schema-derived version + question_index_groups = self.schema.question_index_groups + # set attributes for learning rate, betas, used by self.configure_optimizers() # TODO refactor to optimizer params self.learning_rate = learning_rate @@ -253,7 +271,7 @@ def log_outputs(self, outputs, step_name): if outputs['predictions'].shape[1] == 2: # will only do for binary classifications self.log( "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, sync_dist=True) - # pass + def log_loss_per_question(self, multiq_loss, prefix): @@ -261,9 +279,39 @@ def log_loss_per_question(self, multiq_loss, prefix): # TODO need schema attribute or similar to have access to question names, this will do for now # unlike Finetuneable..., does not use TorchMetrics, simply logs directly # TODO could use TorchMetrics and for q in schema, self.q_metric loop - for question_n in range(multiq_loss.shape[1]): - self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) - # pass + + if hasattr(self, 'schema'): + # use schema metadata to log intelligently + # will have schema if question_answer_pairs and dependencies are passed to __init__ + + # assume that questions are named like smooth-or-featured-CAMPAIGN + for question_n, question in enumerate(self.schema.questions): + self.log( + f'{prefix}/epoch_questions/loss_{question.text}', + torch.mean(multiq_loss[:, question_n]), + on_epoch=True, + on_step=False, + sync_dist=True + ) + + campaigns = [question.text.split('-')[-1] for question in self.schema.questions] + for campaign in campaigns: + campaign_questions = [q for q in self.schema.questions if campaign in q.text] + campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] + self.log( + f'{prefix}/epoch_campaigns/loss_{campaign}', + torch.mean(multiq_loss[:, campaign_q_indices]), + on_epoch=True, + on_step=False, + sync_dist=True + ) + + else: + # fallback to logging 
with question_n + for question_n in range(multiq_loss.shape[1]): + self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) + + diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index d5cab9cb..6e6b45f8 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -257,7 +257,10 @@ def train_default_zoobot_from_scratch( # these args are automatically logged lightning_model = define_model.ZoobotTree( output_dim=len(schema.label_cols), - question_index_groups=schema.question_index_groups, + # question_index_groups=schema.question_index_groups, + # NEW - pass these from schema, for better logging + question_answer_pairs=schema.question_answer_pairs, + dependencies=schema.dependencies, architecture_name=architecture_name, channels=channels, # use_imagenet_weights=False, diff --git a/zoobot/shared/schemas.py b/zoobot/shared/schemas.py index 8d32f878..88a6c3bf 100755 --- a/zoobot/shared/schemas.py +++ b/zoobot/shared/schemas.py @@ -130,7 +130,7 @@ def set_dependencies(questions, dependencies): class Schema(): - def __init__(self, question_answer_pairs:dict, dependencies): + def __init__(self, question_answer_pairs:dict, dependencies: dict): """ Relate the df label columns tor question/answer groups and to tfrecod label indices Requires that labels be continguous by question - easily satisfied From cfe7694eda0cf686572bdafac0632228ecfa9ed6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 19 Dec 2023 17:41:28 -0500 Subject: [PATCH 212/307] overwrite --- zoobot/pytorch/datasets/webdataset_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index c54bfe4a..9ffbd2e5 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -22,7 +22,7 @@ import zoobot.pytorch.datasets.webdatamodule as webdatamodule -def catalogs_to_webdataset(dataset_name, wds_dir, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=2048): +def catalogs_to_webdataset(dataset_name, wds_dir, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=2048, overwrite=False): for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: n_shards = len(catalog) // divisor logging.info(n_shards) @@ -34,7 +34,7 @@ def catalogs_to_webdataset(dataset_name, wds_dir, label_cols, train_catalog, tes save_loc = f"{wds_dir}/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically - df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=False) + df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=overwrite) # some tests, if you like # webdataset_utils.load_wds_directly(save_loc) # webdataset_utils.load_wds_with_augmentation(save_loc) @@ -107,6 +107,8 @@ def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse if sparse_label_df is not None: shard_df = pd.merge(shard_df, sparse_label_df, how='left', validate='one_to_one', suffixes=('', '_badlabelmerge')) # auto-merge + assert not any(shard_df[label_cols].isna().max()) + # logging.info(shard_save_loc) sink = wds.TarWriter(shard_save_loc) for _, galaxy in shard_df.iterrows(): @@ -120,6 +122,7 @@ 
def galaxy_to_wds(galaxy: pd.Series, label_cols, transform=None): im = cv2.imread(galaxy['file_loc']) # cv2 loads BGR for 'history', fix im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + assert not np.any(np.isnan(np.array(im))), galaxy['file_loc'] # if central_crop is not None: # width, height, _ = im.shape # # assert width == height, (width, height) From f17e89f49b8b0973ac0a7b2303c622b5d066155f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 19 Dec 2023 19:45:02 -0500 Subject: [PATCH 213/307] temporarily force double --- zoobot/pytorch/datasets/webdatamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 09c949a5..5203e248 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -90,7 +90,7 @@ def do_transform(img): def make_label_transform(self): if self.label_cols is not None: def label_transform(label_dict): - return torch.from_numpy(np.array([label_dict.get(col, 0) for col in self.label_cols])) + return torch.from_numpy(np.array([label_dict.get(col, 0) for col in self.label_cols])).double() return label_transform else: return identity # do nothing From f16f773c3354b5a553e31fc0b7a5d4913032d39d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 19 Dec 2023 19:49:37 -0500 Subject: [PATCH 214/307] force int labels --- zoobot/pytorch/datasets/webdataset_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index 9ffbd2e5..d4be76e0 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -136,7 +136,7 @@ def galaxy_to_wds(galaxy: pd.Series, label_cols, transform=None): if transform is not None: im = transform(image=im)['image'] - labels = json.dumps(galaxy[label_cols].to_dict()) + labels = json.dumps(galaxy[label_cols].astype(np.int32).to_dict()) id_str = str(galaxy['id_str']) return { "__key__": id_str, From 61615b9714022bef064e387ed4dd242449b988e2 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 20 Dec 2023 09:36:59 -0500 Subject: [PATCH 215/307] log q/campaign losses with only rel. galaxies --- zoobot/pytorch/estimators/define_model.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index ebb1450f..3cd5bf05 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -286,9 +286,12 @@ def log_loss_per_question(self, multiq_loss, prefix): # assume that questions are named like smooth-or-featured-CAMPAIGN for question_n, question in enumerate(self.schema.questions): + # for logging comparison, want to ignore loss on unlablled examples, i.e. 
take mean ignoring zeros + # could sum, but then this would vary with batch size + nontrivial_loss_mask = multiq_loss[:, question_n] > 1e-8 self.log( f'{prefix}/epoch_questions/loss_{question.text}', - torch.mean(multiq_loss[:, question_n]), + torch.mean(multiq_loss[nontrivial_loss_mask, question_n]), on_epoch=True, on_step=False, sync_dist=True @@ -297,10 +300,14 @@ def log_loss_per_question(self, multiq_loss, prefix): campaigns = [question.text.split('-')[-1] for question in self.schema.questions] for campaign in campaigns: campaign_questions = [q for q in self.schema.questions if campaign in q.text] - campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] + campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] # shape (num q in this campaign e.g. 10) + + # similarly to per-question, only include in mean if (any) q in this campaign has a non-trivial loss + nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 1e-8 # shape batch size + self.log( f'{prefix}/epoch_campaigns/loss_{campaign}', - torch.mean(multiq_loss[:, campaign_q_indices]), + torch.mean(multiq_loss[nontrivial_loss_mask][:, campaign_q_indices]), on_epoch=True, on_step=False, sync_dist=True From 8d28297f827abe0ebc9e147c3dbfe4f14886b009 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 20 Dec 2023 10:12:26 -0500 Subject: [PATCH 216/307] change floor --- zoobot/pytorch/estimators/define_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 3cd5bf05..403c4303 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -288,7 +288,7 @@ def log_loss_per_question(self, multiq_loss, prefix): for question_n, question in enumerate(self.schema.questions): # for logging comparison, want to ignore loss on unlablled examples, i.e. take mean ignoring zeros # could sum, but then this would vary with batch size - nontrivial_loss_mask = multiq_loss[:, question_n] > 1e-8 + nontrivial_loss_mask = multiq_loss[:, question_n] > 1e-3 # 'zero' seems to be ~5e-5 floor in practice self.log( f'{prefix}/epoch_questions/loss_{question.text}', torch.mean(multiq_loss[nontrivial_loss_mask, question_n]), @@ -303,7 +303,7 @@ def log_loss_per_question(self, multiq_loss, prefix): campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] # shape (num q in this campaign e.g. 
10) # similarly to per-question, only include in mean if (any) q in this campaign has a non-trivial loss - nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 1e-8 # shape batch size + nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 1e-3 # shape batch size self.log( f'{prefix}/epoch_campaigns/loss_{campaign}', From c95dc821c79d206efcded9b2378753a5b2bdb8d3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 20 Dec 2023 11:11:44 -0500 Subject: [PATCH 217/307] .int() for DirichletMultionial --- zoobot/pytorch/estimators/define_model.py | 14 ++++++++++---- zoobot/pytorch/training/losses.py | 6 +++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 403c4303..5a1f00ba 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -234,7 +234,6 @@ def __init__( def calculate_and_log_loss(self, predictions, labels, step_name): # self.loss_func returns shape of (galaxy, question), mean to () multiq_loss = self.loss_func(predictions, labels, sum_over_questions=False) - # if hasattr(self, 'schema'): self.log_loss_per_question(multiq_loss, prefix=step_name) # sum over questions and take a per-device mean # for DDP strategy, batch size is constant (batches are not divided, data pool is divided) @@ -283,12 +282,11 @@ def log_loss_per_question(self, multiq_loss, prefix): if hasattr(self, 'schema'): # use schema metadata to log intelligently # will have schema if question_answer_pairs and dependencies are passed to __init__ - # assume that questions are named like smooth-or-featured-CAMPAIGN for question_n, question in enumerate(self.schema.questions): # for logging comparison, want to ignore loss on unlablled examples, i.e. take mean ignoring zeros # could sum, but then this would vary with batch size - nontrivial_loss_mask = multiq_loss[:, question_n] > 1e-3 # 'zero' seems to be ~5e-5 floor in practice + nontrivial_loss_mask = multiq_loss[:, question_n] > 1e-5 # 'zero' seems to be ~5e-5 floor in practice self.log( f'{prefix}/epoch_questions/loss_{question.text}', torch.mean(multiq_loss[nontrivial_loss_mask, question_n]), @@ -296,6 +294,14 @@ def log_loss_per_question(self, multiq_loss, prefix): on_step=False, sync_dist=True ) + self.log( + f'{prefix}/epoch_question_masks/loss_{question.text}_mask', + torch.mean(nontrivial_loss_mask.float()), + on_epoch=True, + on_step=False, + sync_dist=True + ) + campaigns = [question.text.split('-')[-1] for question in self.schema.questions] for campaign in campaigns: @@ -303,7 +309,7 @@ def log_loss_per_question(self, multiq_loss, prefix): campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] # shape (num q in this campaign e.g. 10) # similarly to per-question, only include in mean if (any) q in this campaign has a non-trivial loss - nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 1e-3 # shape batch size + nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 1e-5 # shape batch size self.log( f'{prefix}/epoch_campaigns/loss_{campaign}', diff --git a/zoobot/pytorch/training/losses.py b/zoobot/pytorch/training/losses.py index 39c521c9..46b44e4f 100755 --- a/zoobot/pytorch/training/losses.py +++ b/zoobot/pytorch/training/losses.py @@ -54,7 +54,6 @@ def dirichlet_loss(labels_for_q, concentrations_for_q): # you will get image batches of shape [N/4, 64, 64, 1] and hence have the wrong number of images vs. 
labels (and meaningless images) # so check --shard-img-size carefully! total_count = torch.sum(labels_for_q, axis=1) - # logging.info(total_count) # pytorch dirichlet multinomial implementation will not accept zero total votes, need to handle separately return get_dirichlet_neg_log_prob(labels_for_q, total_count, concentrations_for_q) @@ -105,5 +104,6 @@ def dirichlet_loss(labels_for_q, concentrations_for_q): def get_dirichlet_neg_log_prob(labels_for_q, total_count, concentrations_for_q): # https://docs.pyro.ai/en/stable/distributions.html#dirichletmultinomial - dist = pyro.distributions.DirichletMultinomial(total_count=total_count, concentration=concentrations_for_q, is_sparse=False) - return -dist.log_prob(labels_for_q) # important minus sign + # .int()s avoid rounding errors causing loss of around 1e-5 for questions with 0 votes + dist = pyro.distributions.DirichletMultinomial(total_count=total_count.int(), concentration=concentrations_for_q, is_sparse=False, validate_args=False) + return -dist.log_prob(labels_for_q.int()) # important minus sign From 43607f6323a4147ae54fe46ffb5114028b775a83 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 20 Dec 2023 15:04:49 -0500 Subject: [PATCH 218/307] completely rework logging --- zoobot/pytorch/estimators/define_model.py | 127 ++++++++++++++-------- 1 file changed, 82 insertions(+), 45 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 5a1f00ba..b33afc3e 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -5,7 +5,7 @@ import torch from torch import nn import pytorch_lightning as pl -from torchmetrics import Accuracy +import torchmetrics import timm from zoobot.shared import schemas @@ -56,15 +56,39 @@ def __init__( ): super().__init__() self.save_hyperparameters() # saves all args by default - self.setup_metrics() def setup_metrics(self): # these are ignored unless output dim = 2 - self.train_accuracy = Accuracy(task='binary') - self.val_accuracy = Accuracy(task='binary') - # self.log_on_step = False - # self.log_on_step is useful for debugging, but slower - best when log_every_n_steps is fairly large + self.accuracy_metrics = torchmetrics.MetricCollection({ + 'train/accuracy': torchmetrics.Accuracy(task='binary'), + 'validation/accuracy': torchmetrics.Accuracy(task='binary'), + }) + + self.val_accuracy = torchmetrics.Accuracy(task='binary') + + self.loss_metrics = torchmetrics.MetricCollection({ + 'train/loss': torchmetrics.MeanMetric(nan_strategy='error'), + 'validation/loss': torchmetrics.MeanMetric(nan_strategy='error'), + }) + + # TODO handle when schema doesn't exist + question_metric_dict = {} + for step_name in ['train', 'validation']: + question_metric_dict.update({ + step_name + '/question_loss/' + question.text: torchmetrics.MeanMetric(nan_strategy='ignore') + for question in self.schema.questions + }) + self.question_loss_metrics = torchmetrics.MetricCollection(question_metric_dict) + + campaigns = schema_to_campaigns(self.schema) + campaign_metric_dict = {} + for step_name in ['train', 'validation']: + campaign_metric_dict.update({ + step_name + '/campaign_loss/' + campaign: torchmetrics.MeanMetric(nan_strategy='ignore') + for campaign in campaigns + }) + self.campaign_loss_metrics = torchmetrics.MetricCollection(campaign_metric_dict) def forward(self, x): @@ -87,22 +111,45 @@ def training_step(self, batch, batch_idx): return self.make_step(batch, step_name='train') def on_train_batch_end(self, outputs, 
*args): - self.log_outputs(outputs, step_name='train') + self.update_metrics(outputs, step_name='train') def validation_step(self, batch, batch_idx): return self.make_step(batch, step_name='validation') def on_validation_batch_end(self, outputs, *args): - self.log_outputs(outputs, step_name='validation') - - def log_outputs(self, outputs, step_name): - raise NotImplementedError('Must be subclassed') + self.update_metrics(outputs, step_name='validation') def test_step(self, batch, batch_idx): return self.make_step(batch, step_name='test') def on_test_batch_end(self, outputs, *args): - self.log_outputs(outputs, step_name='test') + self.update_metrics(outputs, step_name='test') + + def on_train_epoch_end(self) -> None: + self.log_all_metrics(step_name='train') + + def on_validation_epoch_end(self) -> None: + self.log_all_metrics(step_name='validation') + + def update_metrics(self, outputs, step_name): + raise NotImplementedError('Must be subclassed') + + def log_all_metrics(self, step_name): + + self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, logger=True) + self.log_dict(self.question_loss_metrics, on_step=False, on_epoch=True, logger=True) + self.log_dict(self.campaign_loss_metrics, on_step=False, on_epoch=True, logger=True) + + if hasattr(self, 'accuracy_metrics'): + self.log_dict( + self.accuracy_metrics, + on_epoch=True, + on_step=False, + prog_bar=True, + logger=True + ) + + def predict_step(self, batch, batch_idx, dataloader_idx=0): @@ -196,6 +243,8 @@ def __init__( # replace with schema-derived version question_index_groups = self.schema.question_index_groups + self.setup_metrics() + # set attributes for learning rate, betas, used by self.configure_optimizers() # TODO refactor to optimizer params self.learning_rate = learning_rate @@ -259,17 +308,17 @@ def configure_optimizers(self): min_lr=1e-6, patience=self.scheduler_params.get('patience', 5) ) - return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'validation/epoch_loss'} + return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'validation/loss'} else: logging.info('No scheduler used') return optimizer # no scheduler - def log_outputs(self, outputs, step_name): - self.log("{}/epoch_loss".format(step_name), outputs['loss'], on_epoch=True, on_step=False,prog_bar=True, logger=True, sync_dist=True) - if outputs['predictions'].shape[1] == 2: # will only do for binary classifications - self.log( - "{}_accuracy".format(step_name), self.train_accuracy(outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), prog_bar=True, sync_dist=True) + def update_metrics(self, outputs, step_name): + self.loss_metrics[step_name + '/loss'](outputs['loss']) + + if outputs['predictions'].shape[1] == 2: + self.accuracy_metrics[step_name + '/accuracy'](outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), @@ -286,43 +335,26 @@ def log_loss_per_question(self, multiq_loss, prefix): for question_n, question in enumerate(self.schema.questions): # for logging comparison, want to ignore loss on unlablled examples, i.e. 
take mean ignoring zeros # could sum, but then this would vary with batch size - nontrivial_loss_mask = multiq_loss[:, question_n] > 1e-5 # 'zero' seems to be ~5e-5 floor in practice - self.log( - f'{prefix}/epoch_questions/loss_{question.text}', - torch.mean(multiq_loss[nontrivial_loss_mask, question_n]), - on_epoch=True, - on_step=False, - sync_dist=True - ) - self.log( - f'{prefix}/epoch_question_masks/loss_{question.text}_mask', - torch.mean(nontrivial_loss_mask.float()), - on_epoch=True, - on_step=False, - sync_dist=True - ) - - - campaigns = [question.text.split('-')[-1] for question in self.schema.questions] + nontrivial_loss_mask = multiq_loss[:, question_n] > 0 # 'zero' seems to be ~5e-5 floor in practice + + this_question_metric = self.question_loss_metrics[prefix + '/question_loss/' + question.text] + this_question_metric(torch.mean(multiq_loss[nontrivial_loss_mask, question_n])) + + campaigns = schema_to_campaigns(self.schema) for campaign in campaigns: campaign_questions = [q for q in self.schema.questions if campaign in q.text] campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] # shape (num q in this campaign e.g. 10) # similarly to per-question, only include in mean if (any) q in this campaign has a non-trivial loss - nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 1e-5 # shape batch size + nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 0 # shape batch size - self.log( - f'{prefix}/epoch_campaigns/loss_{campaign}', - torch.mean(multiq_loss[nontrivial_loss_mask][:, campaign_q_indices]), - on_epoch=True, - on_step=False, - sync_dist=True - ) + this_campaign_metric = self.campaign_loss_metrics[prefix + '/campaign_loss/' + campaign] + this_campaign_metric(torch.mean(multiq_loss[nontrivial_loss_mask][:, campaign_q_indices])) else: # fallback to logging with question_n for question_n in range(multiq_loss.shape[1]): - self.log(f'{prefix}/epoch_questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) + self.log(f'{prefix}/questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) @@ -431,6 +463,11 @@ def get_pytorch_dirichlet_head(encoder_dim: int, output_dim: int, test_time_drop return nn.Sequential(*modules_to_use) +def schema_to_campaigns(schema): + # e.g. [gz2, dr12, ...] 
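The mask above is the important detail: a galaxy with no votes for a question contributes exactly zero loss for that question, so it is excluded from that question's mean rather than dragging it towards zero. A small self-contained illustration with made-up values:

    import torch

    # per-(galaxy, question) loss; zeros mark galaxies unlabelled for that question
    multiq_loss = torch.tensor([
        [0.7, 0.0],
        [0.0, 1.1],
        [0.5, 0.9],
    ])

    for question_n in range(multiq_loss.shape[1]):
        labelled = multiq_loss[:, question_n] > 0
        print(question_n, multiq_loss[labelled, question_n].mean())
    # question 0 -> 0.6000 (mean of 0.7 and 0.5); question 1 -> 1.0000 (mean of 1.1 and 0.9)

The campaign-level version works the same way, with the mask taken over the summed loss of that campaign's questions.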
+ return [question.text.split('-')[-1] for question in schema.questions] + + # class ToyEncoder(pl.LightningModule): # def __init__(self): From eb8bd9a39d22c86bfbd7384acaa270d1f27d9366 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 20 Dec 2023 15:48:25 -0500 Subject: [PATCH 219/307] typo --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 6e6b45f8..f826c940 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -291,7 +291,7 @@ def train_default_zoobot_from_scratch( save_top_k=save_top_k ) - early_stopping_callback = EarlyStopping(monitor='validation/epoch_loss', patience=patience, check_finite=True) + early_stopping_callback = EarlyStopping(monitor='validation/loss', patience=patience, check_finite=True) callbacks = [checkpoint_callback, early_stopping_callback] + extra_callbacks # callbacks = None From 554826bc7817184c1d7d14b0a37253383d86a17d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 20 Dec 2023 15:57:56 -0500 Subject: [PATCH 220/307] typo --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index f826c940..c4a622fa 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -277,10 +277,12 @@ def train_default_zoobot_from_scratch( extra_callbacks = extra_callbacks if extra_callbacks else [] + monitor_metric = 'validation/loss' + # used later for checkpoint_callback.best_model_path checkpoint_callback = ModelCheckpoint( dirpath=os.path.join(save_dir, 'checkpoints'), - monitor="validation/epoch_loss", + monitor=monitor_metric, save_weights_only=True, mode='min', # custom filename for checkpointing due to / in metric @@ -291,7 +293,7 @@ def train_default_zoobot_from_scratch( save_top_k=save_top_k ) - early_stopping_callback = EarlyStopping(monitor='validation/loss', patience=patience, check_finite=True) + early_stopping_callback = EarlyStopping(monitor=monitor_metric, patience=patience, check_finite=True) callbacks = [checkpoint_callback, early_stopping_callback] + extra_callbacks # callbacks = None From e8aa6b6b4723af12486b459bc31b5b216e885ddc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 20 Dec 2023 16:55:30 -0500 Subject: [PATCH 221/307] continuing metric rework --- zoobot/pytorch/estimators/define_model.py | 102 +++++++++--------- .../training/train_with_pytorch_lightning.py | 1 + 2 files changed, 54 insertions(+), 49 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index b33afc3e..b2f3bcc9 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -60,26 +60,26 @@ def __init__( def setup_metrics(self): # these are ignored unless output dim = 2 - self.accuracy_metrics = torchmetrics.MetricCollection({ + self.accuracy_metrics = torch.nn.ModuleDict({ 'train/accuracy': torchmetrics.Accuracy(task='binary'), 'validation/accuracy': torchmetrics.Accuracy(task='binary'), }) self.val_accuracy = torchmetrics.Accuracy(task='binary') - self.loss_metrics = torchmetrics.MetricCollection({ + self.loss_metrics = 
torch.nn.ModuleDict({ 'train/loss': torchmetrics.MeanMetric(nan_strategy='error'), 'validation/loss': torchmetrics.MeanMetric(nan_strategy='error'), }) # TODO handle when schema doesn't exist question_metric_dict = {} - for step_name in ['train', 'validation']: + for step_name in ['train', 'validation']: # TODO test question_metric_dict.update({ step_name + '/question_loss/' + question.text: torchmetrics.MeanMetric(nan_strategy='ignore') for question in self.schema.questions }) - self.question_loss_metrics = torchmetrics.MetricCollection(question_metric_dict) + self.question_loss_metrics = torch.nn.ModuleDict(question_metric_dict) campaigns = schema_to_campaigns(self.schema) campaign_metric_dict = {} @@ -88,7 +88,7 @@ def setup_metrics(self): step_name + '/campaign_loss/' + campaign: torchmetrics.MeanMetric(nan_strategy='ignore') for campaign in campaigns }) - self.campaign_loss_metrics = torchmetrics.MetricCollection(campaign_metric_dict) + self.campaign_loss_metrics = torch.nn.ModuleDict(campaign_metric_dict) def forward(self, x): @@ -98,11 +98,10 @@ def forward(self, x): def make_step(self, batch, step_name): x, labels = batch predictions = self(x) # by default, these are Dirichlet concentrations - loss = self.calculate_and_log_loss(predictions, labels, step_name) - return {'loss': loss, 'predictions': predictions, 'labels': labels} - - def calculate_and_log_loss(self, predictions, labels, step_name): - raise NotImplementedError('Must be subclassed') + loss = self.calculate_loss_and_update_loss_metrics(predictions, labels, step_name) + outputs = {'loss': loss, 'predictions': predictions, 'labels': labels} + self.update_other_metrics(outputs, step_name=step_name) + return outputs def configure_optimizers(self): raise NotImplementedError('Must be subclassed') @@ -110,31 +109,36 @@ def configure_optimizers(self): def training_step(self, batch, batch_idx): return self.make_step(batch, step_name='train') - def on_train_batch_end(self, outputs, *args): - self.update_metrics(outputs, step_name='train') - def validation_step(self, batch, batch_idx): return self.make_step(batch, step_name='validation') - - def on_validation_batch_end(self, outputs, *args): - self.update_metrics(outputs, step_name='validation') - + def test_step(self, batch, batch_idx): return self.make_step(batch, step_name='test') - def on_test_batch_end(self, outputs, *args): - self.update_metrics(outputs, step_name='test') + # def on_train_batch_end(self, outputs, *args): + # pass + + # def on_validation_batch_end(self, outputs, *args): + # pass def on_train_epoch_end(self) -> None: - self.log_all_metrics(step_name='train') + # called *after* on_validation_epoch_end, confusingly + # do NOT log_all_metrics here. 
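The nan_strategy choices above are deliberate: the overall train/validation loss uses 'error' so a genuinely non-finite loss fails loudly, while the per-question and per-campaign means use 'ignore' because a question may see no labelled galaxies in a batch, in which case its masked mean is NaN. A rough sketch of the torchmetrics behaviour being relied on (not part of the patch):

    import torch
    import torchmetrics

    question_loss = torchmetrics.MeanMetric(nan_strategy='ignore')
    question_loss.update(torch.tensor(0.5))
    question_loss.update(torch.tensor(float('nan')))  # batch with no labelled galaxies for this question
    print(question_loss.compute())  # tensor(0.5000) - the NaN update is skipped, not averaged in

Logging these metric objects (as log_all_metrics does via log_dict) computes and resets them, which is why they are logged once per validation epoch rather than from both epoch-end hooks.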
+ # logging a metric resets it, and on_validation_epoch_end just logged and reset everything, so you will only log nans + pass def on_validation_epoch_end(self) -> None: - self.log_all_metrics(step_name='validation') + # raise ValueError('val epoch end') + # called at end of val epoch, but BEFORE on_train_epoch_end + self.log_all_metrics() # logs all metrics, so can do in val only - def update_metrics(self, outputs, step_name): + def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name): + raise NotImplementedError('Must be subclassed') + + def update_other_metrics(self, outputs, step_name): raise NotImplementedError('Must be subclassed') - def log_all_metrics(self, step_name): + def log_all_metrics(self): self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, logger=True) self.log_dict(self.question_loss_metrics, on_step=False, on_epoch=True, logger=True) @@ -280,14 +284,15 @@ def __init__( logging.info('Zoobot __init__ complete') - def calculate_and_log_loss(self, predictions, labels, step_name): + def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name): # self.loss_func returns shape of (galaxy, question), mean to () multiq_loss = self.loss_func(predictions, labels, sum_over_questions=False) - self.log_loss_per_question(multiq_loss, prefix=step_name) + self.update_per_question_loss_metric(multiq_loss, step_name=step_name) # sum over questions and take a per-device mean # for DDP strategy, batch size is constant (batches are not divided, data pool is divided) # so this will be the global per-example mean loss = torch.mean(torch.sum(multiq_loss, axis=1)) + self.loss_metrics[step_name + '/loss'](loss) return loss @@ -314,48 +319,47 @@ def configure_optimizers(self): return optimizer # no scheduler - def update_metrics(self, outputs, step_name): - self.loss_metrics[step_name + '/loss'](outputs['loss']) + def update_other_metrics(self, outputs, step_name): if outputs['predictions'].shape[1] == 2: self.accuracy_metrics[step_name + '/accuracy'](outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), - - def log_loss_per_question(self, multiq_loss, prefix): + def update_per_question_loss_metric(self, multiq_loss, step_name): # log questions individually # TODO need schema attribute or similar to have access to question names, this will do for now # unlike Finetuneable..., does not use TorchMetrics, simply logs directly # TODO could use TorchMetrics and for q in schema, self.q_metric loop - if hasattr(self, 'schema'): + # if hasattr(self, 'schema'): # use schema metadata to log intelligently # will have schema if question_answer_pairs and dependencies are passed to __init__ # assume that questions are named like smooth-or-featured-CAMPAIGN - for question_n, question in enumerate(self.schema.questions): - # for logging comparison, want to ignore loss on unlablled examples, i.e. take mean ignoring zeros - # could sum, but then this would vary with batch size - nontrivial_loss_mask = multiq_loss[:, question_n] > 0 # 'zero' seems to be ~5e-5 floor in practice + for question_n, question in enumerate(self.schema.questions): + # for logging comparison, want to ignore loss on unlablled examples, i.e. 
take mean ignoring zeros + # could sum, but then this would vary with batch size + nontrivial_loss_mask = multiq_loss[:, question_n] > 0 # 'zero' seems to be ~5e-5 floor in practice - this_question_metric = self.question_loss_metrics[prefix + '/question_loss/' + question.text] - this_question_metric(torch.mean(multiq_loss[nontrivial_loss_mask, question_n])) + this_question_metric = self.question_loss_metrics[step_name + '/question_loss/' + question.text] + # raise ValueError + this_question_metric(torch.mean(multiq_loss[nontrivial_loss_mask, question_n])) - campaigns = schema_to_campaigns(self.schema) - for campaign in campaigns: - campaign_questions = [q for q in self.schema.questions if campaign in q.text] - campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] # shape (num q in this campaign e.g. 10) + campaigns = schema_to_campaigns(self.schema) + for campaign in campaigns: + campaign_questions = [q for q in self.schema.questions if campaign in q.text] + campaign_q_indices = [self.schema.questions.index(q) for q in campaign_questions] # shape (num q in this campaign e.g. 10) - # similarly to per-question, only include in mean if (any) q in this campaign has a non-trivial loss - nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 0 # shape batch size + # similarly to per-question, only include in mean if (any) q in this campaign has a non-trivial loss + nontrivial_loss_mask = multiq_loss[:, campaign_q_indices].sum(axis=1) > 0 # shape batch size - this_campaign_metric = self.campaign_loss_metrics[prefix + '/campaign_loss/' + campaign] - this_campaign_metric(torch.mean(multiq_loss[nontrivial_loss_mask][:, campaign_q_indices])) + this_campaign_metric = self.campaign_loss_metrics[step_name + '/campaign_loss/' + campaign] + this_campaign_metric(torch.mean(multiq_loss[nontrivial_loss_mask][:, campaign_q_indices])) - else: - # fallback to logging with question_n - for question_n in range(multiq_loss.shape[1]): - self.log(f'{prefix}/questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) - + # else: + # # fallback to logging with question_n + # for question_n in range(multiq_loss.shape[1]): + # self.log(f'{step_name}/questions/question_{question_n}_loss:0', torch.mean(multiq_loss[:, question_n]), on_epoch=True, on_step=False, sync_dist=True) + diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index c4a622fa..2832687e 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -298,6 +298,7 @@ def train_default_zoobot_from_scratch( # callbacks = None trainer = pl.Trainer( + num_sanity_val_steps=0, log_every_n_steps=150, accelerator=accelerator, devices=devices, # per node From d94150e6e5e80b21894ccd215012ee1f937f30d7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 25 Dec 2023 16:33:01 -0500 Subject: [PATCH 222/307] tweaks for foundation --- zoobot/pytorch/datasets/webdatamodule.py | 48 ++++++++++++--------- zoobot/pytorch/datasets/webdataset_utils.py | 28 ++++++------ zoobot/pytorch/estimators/define_model.py | 19 -------- 3 files changed, 41 insertions(+), 54 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 5203e248..da4f2eb9 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -87,14 +87,6 @@ def 
do_transform(img): return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) return do_transform - def make_label_transform(self): - if self.label_cols is not None: - def label_transform(label_dict): - return torch.from_numpy(np.array([label_dict.get(col, 0) for col in self.label_cols])).double() - return label_transform - else: - return identity # do nothing - def make_loader(self, urls, mode="train"): logging.info('Making loader with mode {}'.format(mode)) @@ -108,7 +100,7 @@ def make_loader(self, urls, mode="train"): transform_image = self.make_image_transform(mode=mode) - transform_label = self.make_label_transform() + transform_label = dict_to_label_cols_factory(self.label_cols) dataset = wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) # https://webdataset.github.io/webdataset/multinode/ @@ -138,17 +130,6 @@ def make_loader(self, urls, mode="train"): # so use the torch collate instead dataset = dataset.batched(self.batch_size, torch.utils.data.default_collate, partial=False) - loader = wds.WebLoader( - dataset, - batch_size=None, # already batched - shuffle=False, # already shuffled - num_workers=self.num_workers, - pin_memory=True, - prefetch_factor=self.prefetch_factor - ) - - loader.length = dataset_size // self.batch_size - # temp hack instead if mode in ['train', 'val']: assert dataset_size % self.batch_size == 0, (dataset_size, self.batch_size, dataset_size % self.batch_size) @@ -159,6 +140,8 @@ def make_loader(self, urls, mode="train"): # loader = loader.ddp_equalize(dataset_size // self.batch_size) # print("# loader length", len(loader)) + loader = webdataset_to_webloader(dataset, self.num_workers, self.prefetch_factor) + return loader def train_dataloader(self): @@ -197,4 +180,27 @@ def get_first(x): def custom_collate(x): if isinstance(x, list) and len(x) == 1: x = x[0] - return torch.utils.data.default_collate(x) \ No newline at end of file + return torch.utils.data.default_collate(x) + + +def webdataset_to_webloader(dataset, num_workers, prefetch_factor): + loader = wds.WebLoader( + dataset, + batch_size=None, # already batched + shuffle=False, # already shuffled + num_workers=num_workers, + pin_memory=True, + prefetch_factor=prefetch_factor + ) + + # loader.length = dataset_size // batch_size + return loader + + +def dict_to_label_cols_factory(label_cols=None): + if label_cols is not None: + def label_transform(label_dict): + return torch.from_numpy(np.array([label_dict.get(col, 0) for col in label_cols])).double() # gets cast to int in zoobot loss + return label_transform + else: + return identity # do nothing diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index d4be76e0..3c63360c 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -22,23 +22,23 @@ import zoobot.pytorch.datasets.webdatamodule as webdatamodule -def catalogs_to_webdataset(dataset_name, wds_dir, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=2048, overwrite=False): - for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: - n_shards = len(catalog) // divisor - logging.info(n_shards) +# def catalogs_to_webdataset(dataset_name, wds_dir, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=2048, overwrite=False): +# for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: +# n_shards = len(catalog) // 
divisor +# logging.info(n_shards) - catalog = catalog[:n_shards*divisor] - logging.info(len(catalog)) +# catalog = catalog[:n_shards*divisor] +# logging.info(len(catalog)) - # wds_dir e.g. /home/walml/data/wds +# # wds_dir e.g. /home/walml/data/wds - save_loc = f"{wds_dir}/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically +# save_loc = f"{wds_dir}/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically - df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=overwrite) - # some tests, if you like - # webdataset_utils.load_wds_directly(save_loc) - # webdataset_utils.load_wds_with_augmentation(save_loc) - # webdataset_utils.load_wds_with_webdatamodule([save_loc], label_cols) +# df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=overwrite) +# # some tests, if you like +# # webdataset_utils.load_wds_directly(save_loc) +# # webdataset_utils.load_wds_with_augmentation(save_loc) +# # webdataset_utils.load_wds_with_webdatamodule([save_loc], label_cols) def make_mock_wds(save_dir: str, label_cols: List, n_shards: int, shard_size: int): @@ -136,7 +136,7 @@ def galaxy_to_wds(galaxy: pd.Series, label_cols, transform=None): if transform is not None: im = transform(image=im)['image'] - labels = json.dumps(galaxy[label_cols].astype(np.int32).to_dict()) + labels = json.dumps(galaxy[label_cols].to_dict()) id_str = str(galaxy['id_str']) return { "__key__": id_str, diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index b2f3bcc9..dbfe3e4d 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -470,22 +470,3 @@ def get_pytorch_dirichlet_head(encoder_dim: int, output_dim: int, test_time_drop def schema_to_campaigns(schema): # e.g. [gz2, dr12, ...] 
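One behaviour worth flagging from the webdatamodule change above: the transform returned by dict_to_label_cols_factory orders the votes by label_cols and fills any column missing from a sample's labels.json with 0 votes, so (per the loss masking earlier) those questions simply contribute no loss. Roughly, with illustrative column names:

    import numpy as np
    import torch

    label_cols = ['smooth-or-featured-gz2_smooth', 'smooth-or-featured-gz2_featured-or-disk']
    label_dict = {'smooth-or-featured-gz2_smooth': 12}  # decoded labels.json for one galaxy

    labels = torch.from_numpy(np.array([label_dict.get(col, 0) for col in label_cols])).double()
    print(labels)  # tensor([12., 0.], dtype=torch.float64)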
return [question.text.split('-')[-1] for question in schema.questions] - - -# class ToyEncoder(pl.LightningModule): - -# def __init__(self): -# super(ToyEncoder, self).__init__() - -# self.conv1 = nn.Conv2d(3, 6, 5) -# self.pool = nn.MaxPool2d(2, 2) -# self.conv2 = nn.Conv2d(6, 16, 5) -# # pool again -# self.fc1 = nn.Linear(16 * 5 * 5, 1280) # dim 1280, like effnetb0 - -# def forward(self, x): -# x = self.pool(nn.functional.relu(self.conv1(x))) -# x = self.pool(nn.functional.relu(self.conv2(x))) -# x = x.view(-1, 16 * 5 * 5) -# x = nn.functional.relu(self.fc1(x)) -# return x From d109603fb8bd050d6b14bdf5296ba6a411743707 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 2 Jan 2024 17:35:31 -0500 Subject: [PATCH 223/307] small changes for new models --- zoobot/pytorch/datasets/webdatamodule.py | 3 +- zoobot/pytorch/datasets/webdataset_utils.py | 46 +++++------ zoobot/pytorch/estimators/define_model.py | 15 +++- zoobot/pytorch/training/finetune.py | 88 +++++++++++---------- zoobot/shared/schemas.py | 2 + 5 files changed, 87 insertions(+), 67 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index da4f2eb9..075f4201 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -159,7 +159,7 @@ def identity(x): def nodesplitter_func(urls): urls_to_use = list(wds.split_by_node(urls)) # rely on WDS for the hard work rank, world_size, worker, num_workers = wds.utils.pytorch_worker_info() - logging.info( + logging.debug( f''' Splitting urls within webdatamodule with WORLD_SIZE: {world_size}, RANK: {rank}, WORKER: {worker} of {num_workers}\n @@ -169,6 +169,7 @@ def nodesplitter_func(urls): return urls_to_use def interpret_shard_size_from_url(url): + assert isinstance(url, str), TypeError(url) return int(url.rstrip('.tar').split('_')[-1]) def interpret_dataset_size_from_urls(urls): diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index 3c63360c..3c6c9d46 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -1,5 +1,5 @@ import logging -from typing import List +from typing import Union, Callable import os import cv2 import json @@ -41,7 +41,7 @@ # # webdataset_utils.load_wds_with_webdatamodule([save_loc], label_cols) -def make_mock_wds(save_dir: str, label_cols: List, n_shards: int, shard_size: int): +def make_mock_wds(save_dir: str, label_cols: list, n_shards: int, shard_size: int): counter = 0 shards = [os.path.join(save_dir, f'mock_shard_{shard_n}_{shard_size}.tar') for shard_n in range(n_shards)] for shard in shards: @@ -103,47 +103,49 @@ def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse for shard_n, shard_df in tqdm.tqdm(enumerate(shard_dfs), total=len(shard_dfs)): shard_save_loc = save_loc.replace('.tar', f'_{shard_n}_{len(shard_df)}.tar') if overwrite or not(os.path.isfile(shard_save_loc)): - if sparse_label_df is not None: - shard_df = pd.merge(shard_df, sparse_label_df, how='left', validate='one_to_one', suffixes=('', '_badlabelmerge')) # auto-merge + shard_df = pd.merge(shard_df, sparse_label_df, how='left', validate='one_to_one', suffixes=('', '_badlabelmerge')) # type: ignore # auto-merge - assert not any(shard_df[label_cols].isna().max()) + assert not any(shard_df[label_cols].isna().max()) # type: ignore # logging.info(shard_save_loc) sink = wds.TarWriter(shard_save_loc) - for _, galaxy in shard_df.iterrows(): + for _, galaxy in 
shard_df.iterrows(): # type: ignore sink.write(galaxy_to_wds(galaxy, label_cols, transform=transform)) sink.close() -def galaxy_to_wds(galaxy: pd.Series, label_cols, transform=None): +def galaxy_to_wds(galaxy: pd.Series, label_cols: Union[list[str],None]=None, metadata_cols: Union[list, None]=None, transform: Union[Callable, None]=None): assert os.path.isfile(galaxy['file_loc']), galaxy['file_loc'] im = cv2.imread(galaxy['file_loc']) # cv2 loads BGR for 'history', fix im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) assert not np.any(np.isnan(np.array(im))), galaxy['file_loc'] - # if central_crop is not None: - # width, height, _ = im.shape - # # assert width == height, (width, height) - # mid = int(width/2) - # half_central_crop = int(central_crop/2) - # low_edge, high_edge = mid - half_central_crop, mid + half_central_crop - # im = im[low_edge:high_edge, low_edge:high_edge] - # assert im.shape == (central_crop, central_crop, 3) - - # apply albumentations - if transform is not None: - im = transform(image=im)['image'] - labels = json.dumps(galaxy[label_cols].to_dict()) id_str = str(galaxy['id_str']) + + if transform is not None: + im = transform(image=im)['image'] + + if label_cols is None: + labels = json.dumps({}) + else: + labels = json.dumps(galaxy[label_cols].to_dict()) + + if metadata_cols is None: + metadata = json.dumps({}) + else: + metadata = json.dumps(galaxy[metadata_cols].to_dict()) + return { - "__key__": id_str, + "__key__": id_str, # silly wds bug where if __key__ ends .jpg, all keys get jpg. prepended?! use id_str instead "image.jpg": im, - "labels.json": labels + "labels.json": labels, + "metadata.json": metadata } + # just for debugging def load_wds_directly(wds_loc, max_to_load=3): diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index dbfe3e4d..7a67193b 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -267,7 +267,7 @@ def __init__( self.encoder = torch.compile(self.encoder) # bit lazy assuming 224 input size - self.encoder_dim = get_encoder_dim(self.encoder, input_size=224, channels=channels) + self.encoder_dim = get_encoder_dim(self.encoder) # typically encoder_dim=1280 for effnetb0 logging.info('encoder dim: {}'.format(self.encoder_dim)) @@ -385,9 +385,16 @@ def dirichlet_loss(preds, labels, question_index_groups, sum_over_questions=Fals return multiq_loss -def get_encoder_dim(encoder, input_size, channels): - x = torch.randn(1, channels, input_size, input_size) # batch size of 1 - return encoder(x).shape[-1] +# input_size doesn't matter as long as it's large enough to not be pooled to zero +# channels doesn't matter at all +def get_encoder_dim(encoder): + try: + x = torch.randn(1, 3, 224, 224) # BCHW + return encoder(x).shape[-1] + except RuntimeError: # tensor might not be on same device as model, just try the only other option + x = torch.randn(1, 3, 224, 224).to('cuda') + return encoder(x).shape[-1] + def get_pytorch_encoder( diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 09272084..f68e05a7 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -2,7 +2,7 @@ # https://github.com/inigoval/finetune/blob/main/finetune.py import logging import os -from typing import Any +from typing import Any, Union import warnings from functools import partial @@ -13,6 +13,7 @@ import torch import torch.nn.functional as F import torchmetrics as tm +import timm from zoobot.pytorch.training import 
losses from zoobot.pytorch.estimators import define_model @@ -59,11 +60,10 @@ class FinetuneableZoobotAbstract(pl.LightningModule): def __init__( self, - # can provide either checkpoint_loc, and will load this model as encoder... - checkpoint_loc=None, - # ...or directly pass model to use as encoder + # can provide either zoobot_checkpoint_loc, and will load this model as encoder... + zoobot_checkpoint_loc=None, + # ...or directly pass any model to use as encoder (if you do this, you will need to keep it around for later) encoder=None, - encoder_dim=1280, # as per current Zooot. TODO Could get automatically? n_epochs=100, # TODO early stopping n_blocks=0, # how many layers deep to FT lr_decay=0.75, @@ -81,22 +81,24 @@ def __init__( # adds every __init__ arg to model.hparams # will also add to wandb if using logging=wandb, I think # necessary if you want to reload! - with warnings.catch_warnings(): - warnings.simplefilter("ignore") + # with warnings.catch_warnings(): + # warnings.simplefilter("ignore") # this raises a warning that encoder is already a Module hence saved in checkpoint hence no need to save as hparam # true - except we need it to instantiate this class, so it's really handy to have saved as well # therefore ignore the warning - self.save_hyperparameters() + self.save_hyperparameters(ignore=['encoder']) # never serialise the encoder, way too heavy + # if you need the encoder to recreate, pass when loading checkpoint e.g. + # FinetuneableZoobotTree.load_from_checkpoint(loc, encoder=encoder) - if checkpoint_loc is not None: + if zoobot_checkpoint_loc is not None: assert encoder is None, 'Cannot pass both checkpoint to load and encoder to use' - self.encoder = load_pretrained_encoder(checkpoint_loc) + self.encoder = load_pretrained_zoobot(zoobot_checkpoint_loc) else: - assert checkpoint_loc is None, 'Cannot pass both checkpoint to load and encoder to use' + assert zoobot_checkpoint_loc is None, 'Cannot pass both checkpoint to load and encoder to use' assert encoder is not None, 'Must pass either checkpoint to load or encoder to use' self.encoder = encoder - self.encoder_dim = encoder_dim + self.encoder_dim = define_model.get_encoder_dim(self.encoder) self.n_blocks = n_blocks # for backwards compat. 
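A minimal sketch (not part of the patch) of the reload pattern described in the comment above: because the encoder is now excluded from save_hyperparameters, it has to be passed back in when restoring a finetuned checkpoint. The timm architecture and the checkpoint path below are illustrative assumptions, not values from this repo.

    import timm
    from zoobot.pytorch.training import finetune

    # recreate the same encoder architecture that was used during finetuning
    # (num_classes=0 gives pooled features only, no classification head)
    encoder = timm.create_model('efficientnet_b0', pretrained=False, num_classes=0)

    # hand the encoder back when loading; the remaining hparams come from the checkpoint
    model = finetune.FinetuneableZoobotTree.load_from_checkpoint(
        'finetuned_tree.ckpt',  # hypothetical path
        encoder=encoder,
    )
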
@@ -128,31 +130,37 @@ def configure_optimizers(self): lr = self.learning_rate params = [{"params": self.head.parameters(), "lr": lr}] - if hasattr(self.encoder, 'blocks'): - logging.info('Effnet detected') - # TODO this actually excludes the first conv layer/bn - encoder_blocks = self.encoder.blocks - blocks_to_tune = list(encoder_blocks) - elif hasattr(self.encoder, 'layer4'): - logging.info('Resnet detected') - # similarly, excludes first conv/bn + # architecture = self.encoder.default_config['architecture'] + logging.info(f'Encoder architecture to finetune: {type(self.encoder)}') + + # if 'efficientnet' in architecture: + if isinstance(self.encoder, timm.models.EfficientNet): + # TODO for now, these count as separate layers, not ideal + early_tuneable_layers = [self.encoder.conv_stem, self.encoder.bn1] + encoder_blocks = list(self.encoder.blocks) + blocks_to_tune = early_tuneable_layers + encoder_blocks + elif isinstance(self.encoder, timm.models.ResNet): + # all timm resnets seem to have this structure blocks_to_tune = [ + # similarly + self.encoder.conv1, + self.encoder.bn1, self.encoder.layer1, self.encoder.layer2, self.encoder.layer3, self.encoder.layer4 ] - elif hasattr(self.encoder, 'stages'): - logging.info('Max-ViT Tiny detected') - blocks_to_tune = [ - # getattr as obj.0 is not allowed (why does timm call them 0!?) - getattr(self.encoder.stages, '0'), - getattr(self.encoder.stages, '1'), - getattr(self.encoder.stages, '2'), - getattr(self.encoder.stages, '3'), - ] + elif isinstance(self.encoder, timm.models.MaxxVit): + blocks_to_tune = self.encoder.stem + [stage for stage in self.encoder.stages] + # [ + # getattr as obj.0 is not allowed (why does timm call them 0!?) + # getattr(self.encoder.stages, '0'), + # getattr(self.encoder.stages, '1'), + # getattr(self.encoder.stages, '2'), + # getattr(self.encoder.stages, '3'), + # ] else: - raise ValueError('Encoder architecture not automatically recognised') + raise ValueError(f'Encoder architecture not automatically recognised: {type(self.encoder)}') assert self.n_blocks <= len( blocks_to_tune @@ -239,7 +247,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx=0): on_epoch=True ) - def on_validation_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx=0): + def on_validation_batch_end(self, outputs: dict, batch, batch_idx: int, dataloader_idx=0): self.val_loss_metric(outputs['loss']) self.log( "finetuning/val_loss", @@ -252,7 +260,7 @@ def on_validation_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx if self.visualize_images: self.upload_images_to_wandb(outputs, batch, batch_idx) - def on_test_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx=0): + def on_test_batch_end(self, outputs: dict, batch, batch_idx: int, dataloader_idx=0): self.test_loss_metric(outputs['loss']) self.log( "finetuning/test_loss", @@ -319,7 +327,7 @@ def __init__( self.test_acc = tm.Accuracy(task=task, average="micro", num_classes=num_classes) def step_to_dict(self, y, y_pred, loss): - y_class_preds = torch.argmax(y_pred, axis=1) + y_class_preds = torch.argmax(y_pred, axis=1) # type: ignore return {'loss': loss.mean(), 'predictions': y_pred, 'labels': y, 'class_predictions': y_class_preds} def on_train_batch_end(self, step_output, *args): @@ -359,11 +367,11 @@ def on_test_batch_end(self, step_output, *args) -> None: ) - def predict_step(self, x, batch_idx): + def predict_step(self, x: Union[list[torch.Tensor], torch.Tensor], batch_idx): # see Abstract version if isinstance(x, list) and len(x) == 
1: return self(x[0]) - x = self.forward(x) # logits from LinearClassifier + x = self.forward(x) # type: ignore # logits from LinearClassifier # then applies softmax return F.softmax(x, dim=1) @@ -377,7 +385,7 @@ def upload_images_to_wandb(self, outputs, batch, batch_idx): images = [img for img in x[:n_images]] captions = [f'Ground Truth: {y_i} \nPrediction: {y_p_i}' for y_i, y_p_i in zip( y[:n_images], y_pred_softmax[:n_images])] - self.logger.log_image( + self.logger.log_image( # type: ignore key='val_images', images=images, caption=captions) @@ -462,20 +470,20 @@ def configure_optimizers(self): return torch.optim.AdamW(head_params + encoder_params, lr=self.learning_rate) -def load_pretrained_encoder(checkpoint_loc: str) -> torch.nn.Sequential: +def load_pretrained_zoobot(checkpoint_loc: str) -> torch.nn.Module: """ Args: - checkpoint_loc (str): path to saved LightningModule checkpoint, likely of :class:`ZoobotTree`, :class:`FinetuneableZoobotClassifier`, or :class:`FinetunabelZoobotTree`. Must have .encoder attribute. + checkpoint_loc (str): path to saved LightningModule checkpoint, likely of :class:`ZoobotTree`, :class:`FinetuneableZoobotClassifier`, or :class:`FinetunabelZoobotTree`. Must have .zoobot attribute. Returns: - torch.nn.Sequential: pretrained PyTorch encoder within that LightningModule. + torch.nn.Module: pretrained PyTorch encoder within that LightningModule. """ if torch.cuda.is_available(): map_location = None else: # necessary to load gpu-trained model on cpu map_location = torch.device('cpu') - return define_model.ZoobotTree.load_from_checkpoint(checkpoint_loc, map_location=map_location).encoder + return define_model.ZoobotTree.load_from_checkpoint(checkpoint_loc, map_location=map_location).encoder # type: ignore def get_trainer( diff --git a/zoobot/shared/schemas.py b/zoobot/shared/schemas.py index 88a6c3bf..da761349 100755 --- a/zoobot/shared/schemas.py +++ b/zoobot/shared/schemas.py @@ -278,3 +278,5 @@ def answers(self): # so don't log anything during Schema.__init__! 
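The schema additions just below follow the existing one-line pattern: a Schema is built from a question-to-answers dict and a question-to-dependency dict defined in label_metadata. A small sketch of constructing and inspecting one, reusing the gz_evo_v1 metadata already referenced here (the print is purely illustrative):

    from zoobot.shared import label_metadata, schemas

    # pairs: {question: [answers]}; dependencies: {question: answer that must precede it, or None}
    schema = schemas.Schema(label_metadata.gz_evo_v1_pairs, label_metadata.gz_evo_v1_dependencies)

    # ordered question_answer columns, e.g. passed as label_cols to the webdatamodule
    print(schema.label_cols)
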
gz_evo_v1_schema = Schema(label_metadata.gz_evo_v1_pairs, label_metadata.gz_evo_v1_dependencies) + +gz_jwst_schema = Schema(label_metadata.jwst_ortho_pairs, label_metadata.jwst_ortho_dependencies) From d99d193dba38e733f047834a989801fd83b07ad5 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Jan 2024 16:39:51 -0500 Subject: [PATCH 224/307] small ssl tweaks --- zoobot/pytorch/estimators/define_model.py | 28 +++---------------- zoobot/pytorch/training/finetune.py | 2 +- zoobot/pytorch/training/losses.py | 1 - .../training/train_with_pytorch_lightning.py | 2 +- 4 files changed, 6 insertions(+), 27 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 7a67193b..6ba2d1d0 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -59,17 +59,11 @@ def __init__( def setup_metrics(self): - # these are ignored unless output dim = 2 - self.accuracy_metrics = torch.nn.ModuleDict({ - 'train/accuracy': torchmetrics.Accuracy(task='binary'), - 'validation/accuracy': torchmetrics.Accuracy(task='binary'), - }) - self.val_accuracy = torchmetrics.Accuracy(task='binary') self.loss_metrics = torch.nn.ModuleDict({ - 'train/loss': torchmetrics.MeanMetric(nan_strategy='error'), - 'validation/loss': torchmetrics.MeanMetric(nan_strategy='error'), + 'train/supervised_loss': torchmetrics.MeanMetric(nan_strategy='error'), + 'validation/supervised_loss': torchmetrics.MeanMetric(nan_strategy='error'), }) # TODO handle when schema doesn't exist @@ -100,7 +94,7 @@ def make_step(self, batch, step_name): predictions = self(x) # by default, these are Dirichlet concentrations loss = self.calculate_loss_and_update_loss_metrics(predictions, labels, step_name) outputs = {'loss': loss, 'predictions': predictions, 'labels': labels} - self.update_other_metrics(outputs, step_name=step_name) + # self.update_other_metrics(outputs, step_name=step_name) return outputs def configure_optimizers(self): @@ -144,15 +138,6 @@ def log_all_metrics(self): self.log_dict(self.question_loss_metrics, on_step=False, on_epoch=True, logger=True) self.log_dict(self.campaign_loss_metrics, on_step=False, on_epoch=True, logger=True) - if hasattr(self, 'accuracy_metrics'): - self.log_dict( - self.accuracy_metrics, - on_epoch=True, - on_step=False, - prog_bar=True, - logger=True - ) - @@ -292,7 +277,7 @@ def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name) # for DDP strategy, batch size is constant (batches are not divided, data pool is divided) # so this will be the global per-example mean loss = torch.mean(torch.sum(multiq_loss, axis=1)) - self.loss_metrics[step_name + '/loss'](loss) + self.loss_metrics[step_name + '/supervised_loss'](loss) return loss @@ -319,11 +304,6 @@ def configure_optimizers(self): return optimizer # no scheduler - def update_other_metrics(self, outputs, step_name): - - if outputs['predictions'].shape[1] == 2: - self.accuracy_metrics[step_name + '/accuracy'](outputs['predictions'], torch.argmax(outputs['labels'], dim=1, keepdim=False)), - def update_per_question_loss_metric(self, multiq_loss, step_name): # log questions individually diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index f68e05a7..f0851623 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -52,7 +52,7 @@ class FinetuneableZoobotAbstract(pl.LightningModule): weight_decay (float, optional): AdamW weight decay arg (i.e. L2 penalty). 
Defaults to 0.05. learning_rate (float, optional): AdamW learning rate arg. Defaults to 1e-4. dropout_prob (float, optional): P of dropout before final output layer. Defaults to 0.5. - freeze_batchnorm (bool, optional): If True, do not update batchnorm stats during finetuning. Defaults to True. + always_train_batchnorm (bool, optional): If True, do not update batchnorm stats during finetuning. Defaults to True. prog_bar (bool, optional): Print progress bar during finetuning. Defaults to True. visualize_images (bool, optional): Upload example images to WandB. Good for debugging but slow. Defaults to False. seed (int, optional): random seed to use. Defaults to 42. diff --git a/zoobot/pytorch/training/losses.py b/zoobot/pytorch/training/losses.py index 46b44e4f..b3c74029 100755 --- a/zoobot/pytorch/training/losses.py +++ b/zoobot/pytorch/training/losses.py @@ -26,7 +26,6 @@ def calculate_multiquestion_loss(labels: torch.Tensor, predictions: torch.Tensor q_indices = question_index_groups[q_n] q_start = q_indices[0] q_end = q_indices[1] - q_loss = dirichlet_loss(labels[:, q_start:q_end+1], predictions[:, q_start:q_end+1]) q_losses.append(q_loss) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 2832687e..ce3e0529 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -277,7 +277,7 @@ def train_default_zoobot_from_scratch( extra_callbacks = extra_callbacks if extra_callbacks else [] - monitor_metric = 'validation/loss' + monitor_metric = 'validation/supervised_loss' # used later for checkpoint_callback.best_model_path checkpoint_callback = ModelCheckpoint( From ee608cfa1718442bf6bcd35ca628949a6cb49194 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 4 Jan 2024 15:17:22 -0500 Subject: [PATCH 225/307] ukidss schema --- zoobot/shared/schemas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zoobot/shared/schemas.py b/zoobot/shared/schemas.py index 88a6c3bf..cd15e6ce 100755 --- a/zoobot/shared/schemas.py +++ b/zoobot/shared/schemas.py @@ -278,3 +278,5 @@ def answers(self): # so don't log anything during Schema.__init__! 
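For context on how these schemas are consumed: calculate_multiquestion_loss (touched just above in losses.py) slices the label and prediction tensors per question using the schema's question_index_groups, with inclusive end indices. A toy sketch of that slicing, using made-up indices for a hypothetical two-question schema:

    import torch

    # hypothetical: columns 0-2 answer question 0, columns 3-4 answer question 1
    question_index_groups = [(0, 2), (3, 4)]

    labels = torch.randint(0, 10, (8, 5)).float()  # (batch, total answers), e.g. volunteer vote counts
    predictions = torch.rand(8, 5) + 1.            # e.g. Dirichlet concentrations, strictly positive

    for q_start, q_end in question_index_groups:
        q_labels = labels[:, q_start:q_end + 1]    # inclusive end index, as in losses.py
        q_preds = predictions[:, q_start:q_end + 1]
        # each (q_labels, q_preds) pair feeds the per-question Dirichlet loss
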
gz_evo_v1_schema = Schema(label_metadata.gz_evo_v1_pairs, label_metadata.gz_evo_v1_dependencies) + +gz_ukidss_schema = Schema(label_metadata.ukidss_ortho_pairs, label_metadata.ukidss_ortho_dependencies) From 8d151675bf48989e009ecc5929f08b32d16662df Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 4 Jan 2024 15:41:48 -0500 Subject: [PATCH 226/307] ssl changes --- zoobot/pytorch/datasets/webdatamodule.py | 17 +++++++++--- .../training/train_with_pytorch_lightning.py | 26 ++++++++++++++++--- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 075f4201..2968bd2b 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -1,5 +1,5 @@ import os -import types +from typing import Callable import logging import torch.utils.data import numpy as np @@ -27,7 +27,8 @@ def __init__( color=False, crop_scale_bounds=(0.7, 0.8), crop_ratio_bounds=(0.9, 1.1), - resize_after_crop=224 + resize_after_crop=224, + transform: Callable=None ): super().__init__() @@ -60,6 +61,8 @@ def __init__( self.crop_scale_bounds = crop_scale_bounds self.crop_ratio_bounds = crop_ratio_bounds + self.transform = transform + for url_name in ['train', 'val', 'test', 'predict']: urls = getattr(self, f'{url_name}_urls') if urls is not None: @@ -98,7 +101,12 @@ def make_loader(self, urls, mode="train"): assert mode in ['val', 'test', 'predict'], mode shuffle = 0 - transform_image = self.make_image_transform(mode=mode) + if self.transform is None: + logging.info('Using default transform') + transform_image = self.make_image_transform(mode=mode) + else: + logging.info('Ignoring hparams and using directly-passed transform') + transform_image = self.transform transform_label = dict_to_label_cols_factory(self.label_cols) @@ -109,7 +117,8 @@ def make_loader(self, urls, mode="train"): if shuffle > 0: dataset = dataset.shuffle(shuffle) - dataset = dataset.decode("rgb") + # dataset = dataset.decode("rgb") # np.array, for albumentations + dataset = dataset.decode("pilrgb") # PIL Image, for torchvision if mode == 'predict': if self.label_cols != ['id_str']: diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index ce3e0529..da734dec 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -235,6 +235,23 @@ def train_default_zoobot_from_scratch( ) else: # this branch will use WebDataModule to load premade webdatasets + + # temporary: use SSL-like transform + from foundation.models import transforms + # from omegaconf import DictConfig + # cfg = DictConfig({ + # 'aug': { + # 'global_transform_0': { + # 'interpolation': 'bilinear', + # 'random_affine': {} # etc + # } + + # } + # }) + cfg = transforms.default_view_config() + cfg.output_size = resize_after_crop + transform = transforms.GalaxyViewTransform(cfg) + datamodule = webdatamodule.WebDataModule( train_urls=train_urls, val_urls=val_urls, @@ -246,10 +263,11 @@ def train_default_zoobot_from_scratch( prefetch_factor=prefetch_factor, cache_dir=cache_dir, # augmentation args - color=color, - crop_scale_bounds=crop_scale_bounds, - crop_ratio_bounds=crop_ratio_bounds, - resize_after_crop=resize_after_crop + transform=transform, + # color=color, + # crop_scale_bounds=crop_scale_bounds, + # crop_ratio_bounds=crop_ratio_bounds, + # resize_after_crop=resize_after_crop ) datamodule.setup(stage='fit') From 
862ee374c8a74d5b059d9ba7915f0334fd546537 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 5 Jan 2024 10:36:50 -0500 Subject: [PATCH 227/307] torchrgb --- zoobot/pytorch/datasets/webdatamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 2968bd2b..0581dd11 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -118,7 +118,7 @@ def make_loader(self, urls, mode="train"): dataset = dataset.shuffle(shuffle) # dataset = dataset.decode("rgb") # np.array, for albumentations - dataset = dataset.decode("pilrgb") # PIL Image, for torchvision + dataset = dataset.decode("torchrgb") # tensor, for torchvision if mode == 'predict': if self.label_cols != ['id_str']: From 32c4f8b0b44d13291179e3c795cbeb688b14f363 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 8 Jan 2024 14:07:23 -0500 Subject: [PATCH 228/307] needs merge --- zoobot/pytorch/datasets/webdatamodule.py | 13 ++-- zoobot/pytorch/datasets/webdataset_utils.py | 60 +++++++++---------- .../training/train_with_pytorch_lightning.py | 10 ++-- 3 files changed, 42 insertions(+), 41 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 2968bd2b..4dfdc65e 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -28,7 +28,8 @@ def __init__( crop_scale_bounds=(0.7, 0.8), crop_ratio_bounds=(0.9, 1.1), resize_after_crop=224, - transform: Callable=None + train_transform: Callable=None, + inference_transform: Callable=None ): super().__init__() @@ -61,7 +62,8 @@ def __init__( self.crop_scale_bounds = crop_scale_bounds self.crop_ratio_bounds = crop_ratio_bounds - self.transform = transform + self.train_transform = train_transform + self.inference_transform = inference_transform for url_name in ['train', 'val', 'test', 'predict']: urls = getattr(self, f'{url_name}_urls') @@ -101,12 +103,12 @@ def make_loader(self, urls, mode="train"): assert mode in ['val', 'test', 'predict'], mode shuffle = 0 - if self.transform is None: + if self.train_transform is None: logging.info('Using default transform') transform_image = self.make_image_transform(mode=mode) else: - logging.info('Ignoring hparams and using directly-passed transform') - transform_image = self.transform + logging.info('Ignoring hparams and using directly-passed transforms') + transform_image = self.train_transform if mode == 'train' else self.inference_transform transform_label = dict_to_label_cols_factory(self.label_cols) @@ -130,6 +132,7 @@ def make_loader(self, urls, mode="train"): logging.info('Will return id_str only') dataset = dataset.to_tuple('__key__') else: + dataset = ( dataset.to_tuple('image.jpg', 'labels.json') .map_tuple(transform_image, transform_label) diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index 3c6c9d46..5f4ca157 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -21,42 +21,19 @@ import zoobot.pytorch.datasets.webdatamodule as webdatamodule +def catalogs_to_webdataset(dataset_name, wds_dir, label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=2048, overwrite=False): + for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: + n_shards = len(catalog) // divisor + logging.info(n_shards) -# def catalogs_to_webdataset(dataset_name, wds_dir, 
label_cols, train_catalog, test_catalog, sparse_label_df=None, divisor=2048, overwrite=False): -# for (catalog_name, catalog) in [('train', train_catalog), ('test', test_catalog)]: -# n_shards = len(catalog) // divisor -# logging.info(n_shards) + catalog = catalog[:n_shards*divisor] + logging.info(len(catalog)) -# catalog = catalog[:n_shards*divisor] -# logging.info(len(catalog)) - -# # wds_dir e.g. /home/walml/data/wds - -# save_loc = f"{wds_dir}/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically + save_loc = f"{wds_dir}/{dataset_name}/{dataset_name}_{catalog_name}.tar" # .tar replace automatically -# df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=overwrite) -# # some tests, if you like -# # webdataset_utils.load_wds_directly(save_loc) -# # webdataset_utils.load_wds_with_augmentation(save_loc) -# # webdataset_utils.load_wds_with_webdatamodule([save_loc], label_cols) - - -def make_mock_wds(save_dir: str, label_cols: list, n_shards: int, shard_size: int): - counter = 0 - shards = [os.path.join(save_dir, f'mock_shard_{shard_n}_{shard_size}.tar') for shard_n in range(n_shards)] - for shard in shards: - sink = wds.TarWriter(shard) - for galaxy_n in range(shard_size): - data = { - "__key__": f'id_{galaxy_n}', - "image.jpg": (np.random.rand(424, 424)*255.).astype(np.uint8), - "labels.json": json.dumps(dict(zip(label_cols, [np.random.randint(low=0, high=10) for _ in range(len(label_cols))]))) - } - sink.write(data) - counter += 1 - print(counter) - return shards + df_to_wds(catalog, label_cols, save_loc, n_shards=n_shards, sparse_label_df=sparse_label_df, overwrite=overwrite) + def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse_label_df=None, overwrite=False): @@ -206,6 +183,25 @@ def identity(x): # no lambda to be pickleable return x + + +def make_mock_wds(save_dir: str, label_cols: list, n_shards: int, shard_size: int): + counter = 0 + shards = [os.path.join(save_dir, f'mock_shard_{shard_n}_{shard_size}.tar') for shard_n in range(n_shards)] + for shard in shards: + sink = wds.TarWriter(shard) + for galaxy_n in range(shard_size): + data = { + "__key__": f'id_{galaxy_n}', + "image.jpg": (np.random.rand(424, 424)*255.).astype(np.uint8), + "labels.json": json.dumps(dict(zip(label_cols, [np.random.randint(low=0, high=10) for _ in range(len(label_cols))]))) + } + sink.write(data) + counter += 1 + print(counter) + return shards + + if __name__ == '__main__': save_dir = '/home/walml/repos/temp' diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index da734dec..9abc5225 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -248,9 +248,10 @@ def train_default_zoobot_from_scratch( # } # }) - cfg = transforms.default_view_config() - cfg.output_size = resize_after_crop - transform = transforms.GalaxyViewTransform(cfg) + train_transform_cfg = transforms.default_view_config() + inference_transform_cfg = transforms.minimal_view_config() + train_transform_cfg.output_size = resize_after_crop + inference_transform_cfg.output_size = resize_after_crop datamodule = webdatamodule.WebDataModule( train_urls=train_urls, @@ -263,7 +264,8 @@ def train_default_zoobot_from_scratch( prefetch_factor=prefetch_factor, cache_dir=cache_dir, # augmentation args - transform=transform, + train_transform=transforms.GalaxyViewTransform(train_transform_cfg), + 
inference_transform=transforms.GalaxyViewTransform(inference_transform_cfg), # color=color, # crop_scale_bounds=crop_scale_bounds, # crop_ratio_bounds=crop_ratio_bounds, From 80517d5c084d857ec71ed7022d41b98c4dc2798b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 11 Jan 2024 22:43:58 -0500 Subject: [PATCH 229/307] allow nan strat --- zoobot/pytorch/estimators/define_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 6ba2d1d0..6e958819 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -58,12 +58,12 @@ def __init__( self.save_hyperparameters() # saves all args by default - def setup_metrics(self): + def setup_metrics(self, nan_strategy='error'): # may sometimes want to ignore nan even in main metrics self.val_accuracy = torchmetrics.Accuracy(task='binary') self.loss_metrics = torch.nn.ModuleDict({ - 'train/supervised_loss': torchmetrics.MeanMetric(nan_strategy='error'), - 'validation/supervised_loss': torchmetrics.MeanMetric(nan_strategy='error'), + 'train/supervised_loss': torchmetrics.MeanMetric(nan_strategy=nan_strategy), + 'validation/supervised_loss': torchmetrics.MeanMetric(nan_strategy=nan_strategy), }) # TODO handle when schema doesn't exist From 23062a52d562a3890c5331296b7c68875a6b9a40 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 14 Jan 2024 20:32:20 -0500 Subject: [PATCH 230/307] debugging --- zoobot/pytorch/datasets/webdataset_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index 5f4ca157..a542c014 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -88,7 +88,11 @@ def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse # logging.info(shard_save_loc) sink = wds.TarWriter(shard_save_loc) for _, galaxy in shard_df.iterrows(): # type: ignore - sink.write(galaxy_to_wds(galaxy, label_cols, transform=transform)) + try: + sink.write(galaxy_to_wds(galaxy, label_cols, transform=transform)) + except Exception as e: + logging.critical(galaxy) + raise(e) sink.close() From d3b4398cf873a80e55b55527f7702b434a1348dd Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 10:22:31 -0500 Subject: [PATCH 231/307] update labeldict --- zoobot/pytorch/datasets/webdatamodule.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 4fc750b3..98ed14ce 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -217,3 +217,11 @@ def label_transform(label_dict): return label_transform else: return identity # do nothing + +def dict_to_filled_dict_factory(label_cols): + # might be a little slow, but very safe + def label_transform(label_dict: dict): + # modifies inplace with 0 iff key missing + [label_dict.setdefault(col, 0) for col in label_cols] + return label_dict + return label_transform \ No newline at end of file From 372bdb1d421ddf78668d7101426650db068d5119 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 12:57:51 -0500 Subject: [PATCH 232/307] revert datamodule --- .../training/train_with_pytorch_lightning.py | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git 
a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 9abc5225..8caea736 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -237,21 +237,11 @@ def train_default_zoobot_from_scratch( # this branch will use WebDataModule to load premade webdatasets # temporary: use SSL-like transform - from foundation.models import transforms - # from omegaconf import DictConfig - # cfg = DictConfig({ - # 'aug': { - # 'global_transform_0': { - # 'interpolation': 'bilinear', - # 'random_affine': {} # etc - # } - - # } - # }) - train_transform_cfg = transforms.default_view_config() - inference_transform_cfg = transforms.minimal_view_config() - train_transform_cfg.output_size = resize_after_crop - inference_transform_cfg.output_size = resize_after_crop + # from foundation.models import transforms + # train_transform_cfg = transforms.default_view_config() + # inference_transform_cfg = transforms.minimal_view_config() + # train_transform_cfg.output_size = resize_after_crop + # inference_transform_cfg.output_size = resize_after_crop datamodule = webdatamodule.WebDataModule( train_urls=train_urls, @@ -264,12 +254,13 @@ def train_default_zoobot_from_scratch( prefetch_factor=prefetch_factor, cache_dir=cache_dir, # augmentation args - train_transform=transforms.GalaxyViewTransform(train_transform_cfg), - inference_transform=transforms.GalaxyViewTransform(inference_transform_cfg), - # color=color, - # crop_scale_bounds=crop_scale_bounds, - # crop_ratio_bounds=crop_ratio_bounds, - # resize_after_crop=resize_after_crop + color=color, + crop_scale_bounds=crop_scale_bounds, + crop_ratio_bounds=crop_ratio_bounds, + resize_after_crop=resize_after_crop, + # temporary: use SSL-like transform + # train_transform=transforms.GalaxyViewTransform(train_transform_cfg), + # inference_transform=transforms.GalaxyViewTransform(inference_transform_cfg), ) datamodule.setup(stage='fit') @@ -352,7 +343,7 @@ def train_default_zoobot_from_scratch( # can test as per the below, but note that datamodule must have a test dataset attribute as per pytorch lightning docs. # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting - if test_catalog is not None: + if datamodule.test_dataloader is not None: logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. 
Be careful not to overfit your choices to the test data...') test_trainer.validate( model=lightning_model, From e3124b7036a7aef11bd4ac0648fb5fb30511d54f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 13:16:40 -0500 Subject: [PATCH 233/307] add test dataloader --- zoobot/pytorch/datasets/webdatamodule.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 98ed14ce..7943294c 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -161,6 +161,9 @@ def train_dataloader(self): def val_dataloader(self): return self.make_loader(self.val_urls, mode="val") + + def test_dataloader(self): + return self.make_loader(self.val_urls, mode="test") def predict_dataloader(self): return self.make_loader(self.predict_urls, mode="predict") From c0b2e41d286b0fd04e4f6ab26cb7deb171a7ff8e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 13:24:57 -0500 Subject: [PATCH 234/307] add test metrics --- zoobot/pytorch/estimators/define_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 6e958819..53331422 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -64,11 +64,12 @@ def setup_metrics(self, nan_strategy='error'): # may sometimes want to ignore n self.loss_metrics = torch.nn.ModuleDict({ 'train/supervised_loss': torchmetrics.MeanMetric(nan_strategy=nan_strategy), 'validation/supervised_loss': torchmetrics.MeanMetric(nan_strategy=nan_strategy), + 'test/supervised_loss': torchmetrics.MeanMetric(nan_strategy=nan_strategy), }) # TODO handle when schema doesn't exist question_metric_dict = {} - for step_name in ['train', 'validation']: # TODO test + for step_name in ['train', 'validation', 'test']: question_metric_dict.update({ step_name + '/question_loss/' + question.text: torchmetrics.MeanMetric(nan_strategy='ignore') for question in self.schema.questions @@ -77,7 +78,7 @@ def setup_metrics(self, nan_strategy='error'): # may sometimes want to ignore n campaigns = schema_to_campaigns(self.schema) campaign_metric_dict = {} - for step_name in ['train', 'validation']: + for step_name in ['train', 'validation', 'test']: campaign_metric_dict.update({ step_name + '/campaign_loss/' + campaign: torchmetrics.MeanMetric(nan_strategy='ignore') for campaign in campaigns From d7386be33a3c4c4210ae5351e2a187b4624b34fd Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 13:45:03 -0500 Subject: [PATCH 235/307] debug test metrics not appearing --- zoobot/pytorch/datasets/webdatamodule.py | 2 +- .../training/train_with_pytorch_lightning.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 7943294c..abbb32f1 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -163,7 +163,7 @@ def val_dataloader(self): return self.make_loader(self.val_urls, mode="val") def test_dataloader(self): - return self.make_loader(self.val_urls, mode="test") + return self.make_loader(self.test_urls, mode="test") def predict_dataloader(self): return self.make_loader(self.predict_urls, mode="predict") diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 
8caea736..99c96f1e 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -345,16 +345,20 @@ def train_default_zoobot_from_scratch( # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting if datamodule.test_dataloader is not None: logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') - test_trainer.validate( - model=lightning_model, - datamodule=datamodule, - ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - ) + # test_trainer.validate( + # model=lightning_model, + # datamodule=datamodule, + # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + # ) + datamodule.setup(stage='test') + # temp + print(datamodule.test_urls) test_trainer.test( model=lightning_model, datamodule=datamodule, ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" ) + # TODO may need to remake on 1 gpu only # explicitly update the model weights to the best checkpoint before returning # (assumes only one checkpoint callback, very likely in practice) From 9a0027172225362c0ee02d78aef9c150bca190b0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 13:51:02 -0500 Subject: [PATCH 236/307] tweak logging --- zoobot/pytorch/estimators/define_model.py | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 53331422..b5ba1e57 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -120,12 +120,13 @@ def on_train_epoch_end(self) -> None: # called *after* on_validation_epoch_end, confusingly # do NOT log_all_metrics here. 
# logging a metric resets it, and on_validation_epoch_end just logged and reset everything, so you will only log nans - pass + self.log_all_metrics(subset='train') def on_validation_epoch_end(self) -> None: - # raise ValueError('val epoch end') - # called at end of val epoch, but BEFORE on_train_epoch_end - self.log_all_metrics() # logs all metrics, so can do in val only + self.log_all_metrics(subset='validation') + + def on_test_epoch_end(self) -> None: + self.log_all_metrics(subset='test') def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name): raise NotImplementedError('Must be subclassed') @@ -133,11 +134,16 @@ def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name) def update_other_metrics(self, outputs, step_name): raise NotImplementedError('Must be subclassed') - def log_all_metrics(self): - - self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, logger=True) - self.log_dict(self.question_loss_metrics, on_step=False, on_epoch=True, logger=True) - self.log_dict(self.campaign_loss_metrics, on_step=False, on_epoch=True, logger=True) + def log_all_metrics(self, subset=None): + if subset is not None: + for name, metric in self.loss_metrics.items(): + if subset in name: + print('logging', name) + self.log(name, metric, on_epoch=True, on_step=False, prog_bar=True, logger=True) + else: # just log everything + self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, logger=True) + self.log_dict(self.question_loss_metrics, on_step=False, on_epoch=True, logger=True) + self.log_dict(self.campaign_loss_metrics, on_step=False, on_epoch=True, logger=True) From 600ed3ea35070144043dd2bf9d492cf16e02d9e1 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 14:47:20 -0500 Subject: [PATCH 237/307] seems to block on multi-g --- .../training/train_with_pytorch_lightning.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 99c96f1e..c90a386d 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -341,24 +341,27 @@ def train_default_zoobot_from_scratch( best_model_path = trainer.checkpoint_callback.best_model_path - # can test as per the below, but note that datamodule must have a test dataset attribute as per pytorch lightning docs. - # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting - if datamodule.test_dataloader is not None: - logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') - # test_trainer.validate( - # model=lightning_model, - # datamodule=datamodule, - # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - # ) - datamodule.setup(stage='test') - # temp - print(datamodule.test_urls) - test_trainer.test( - model=lightning_model, - datamodule=datamodule, - ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. 
"/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - ) - # TODO may need to remake on 1 gpu only + if test_trainer.is_global_zero: + # can test as per the below, but note that datamodule must have a test dataset attribute as per pytorch lightning docs. + # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting + if datamodule.test_dataloader is not None: + logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') + # test_trainer.validate( + # model=lightning_model, + # datamodule=datamodule, + # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + # ) + datamodule.setup(stage='test') + test_trainer.test( + model=lightning_model, + datamodule=datamodule, + ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + ) + # TODO may need to remake on 1 gpu only + else: + logging.info('No test dataloader found, skipping test metrics') + else: + logging.info('Not global zero, skipping test metrics') # explicitly update the model weights to the best checkpoint before returning # (assumes only one checkpoint callback, very likely in practice) From e0e56cd3dada3768f38d23811b5d30bf0e030f56 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 15:03:31 -0500 Subject: [PATCH 238/307] maybe it's the datamodule --- zoobot/pytorch/estimators/define_model.py | 2 ++ .../training/train_with_pytorch_lightning.py | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index b5ba1e57..30c3481a 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -126,7 +126,9 @@ def on_validation_epoch_end(self) -> None: self.log_all_metrics(subset='validation') def on_test_epoch_end(self) -> None: + logging.info('start test epoch end') self.log_all_metrics(subset='test') + logging.info('end test epoch end') def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name): raise NotImplementedError('Must be subclassed') diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index c90a386d..c04bc8f2 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -351,10 +351,24 @@ def train_default_zoobot_from_scratch( # datamodule=datamodule, # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. 
"/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" # ) - datamodule.setup(stage='test') + test_datamodule = webdatamodule.WebDataModule( + train_urls=None, + val_urls=None, + test_urls=test_urls, + label_cols=schema.label_cols, + batch_size=batch_size, + num_workers=num_workers, + prefetch_factor=prefetch_factor, + cache_dir=cache_dir, + color=color, + crop_scale_bounds=crop_scale_bounds, + crop_ratio_bounds=crop_ratio_bounds, + resize_after_crop=resize_after_crop + ) + test_datamodule.setup(stage='test') test_trainer.test( model=lightning_model, - datamodule=datamodule, + datamodule=test_datamodule, ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" ) # TODO may need to remake on 1 gpu only From 329a88c5fd7aa0537ba89a0003e92123537169c0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 15:15:02 -0500 Subject: [PATCH 239/307] num_workers=1, move test_trainer --- .../training/train_with_pytorch_lightning.py | 81 +++++++++---------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index c04bc8f2..96b0e503 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -331,51 +331,50 @@ def train_default_zoobot_from_scratch( trainer.fit(lightning_model, datamodule) # uses batch size of datamodule - test_trainer = pl.Trainer( - accelerator=accelerator, - devices=1, - precision=precision, - logger=wandb_logger, - default_root_dir=save_dir - ) - best_model_path = trainer.checkpoint_callback.best_model_path - if test_trainer.is_global_zero: - # can test as per the below, but note that datamodule must have a test dataset attribute as per pytorch lightning docs. - # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting - if datamodule.test_dataloader is not None: - logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') - # test_trainer.validate( - # model=lightning_model, - # datamodule=datamodule, - # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - # ) - test_datamodule = webdatamodule.WebDataModule( - train_urls=None, - val_urls=None, - test_urls=test_urls, - label_cols=schema.label_cols, - batch_size=batch_size, - num_workers=num_workers, - prefetch_factor=prefetch_factor, - cache_dir=cache_dir, - color=color, - crop_scale_bounds=crop_scale_bounds, - crop_ratio_bounds=crop_ratio_bounds, - resize_after_crop=resize_after_crop - ) - test_datamodule.setup(stage='test') - test_trainer.test( - model=lightning_model, - datamodule=test_datamodule, - ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. 
"/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - ) - # TODO may need to remake on 1 gpu only + # can test as per the below, but note that datamodule must have a test dataset attribute as per pytorch lightning docs. + # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting + if datamodule.test_dataloader is not None: + logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') + # test_trainer.validate( + # model=lightning_model, + # datamodule=datamodule, + # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + # ) + test_trainer = pl.Trainer( + accelerator=accelerator, + devices=1, + precision=precision, + logger=wandb_logger, + default_root_dir=save_dir + ) + if test_trainer.is_global_zero: + test_datamodule = webdatamodule.WebDataModule( + train_urls=None, + val_urls=None, + test_urls=test_urls, + label_cols=schema.label_cols, + batch_size=batch_size, + num_workers=1, # 20 / 5 == 4, /2=2 + prefetch_factor=prefetch_factor, + cache_dir=None, + color=color, + crop_scale_bounds=crop_scale_bounds, + crop_ratio_bounds=crop_ratio_bounds, + resize_after_crop=resize_after_crop + ) + test_datamodule.setup(stage='test') + test_trainer.test( + model=lightning_model, + datamodule=test_datamodule, + ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + ) else: - logging.info('No test dataloader found, skipping test metrics') + logging.info('Not global zero, skipping test metrics') else: - logging.info('Not global zero, skipping test metrics') + logging.info('No test dataloader found, skipping test metrics') + # explicitly update the model weights to the best checkpoint before returning # (assumes only one checkpoint callback, very likely in practice) From de7562b7edccca982fcf43a2a19188c1070bd8cc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 15:28:08 -0500 Subject: [PATCH 240/307] check if it is logging or wds --- zoobot/pytorch/estimators/define_model.py | 4 +- .../training/train_with_pytorch_lightning.py | 78 +++++++++---------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 30c3481a..2524a116 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -127,7 +127,7 @@ def on_validation_epoch_end(self) -> None: def on_test_epoch_end(self) -> None: logging.info('start test epoch end') - self.log_all_metrics(subset='test') + # self.log_all_metrics(subset='test') logging.info('end test epoch end') def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name): @@ -140,7 +140,7 @@ def log_all_metrics(self, subset=None): if subset is not None: for name, metric in self.loss_metrics.items(): if subset in name: - print('logging', name) + logging.info(name) self.log(name, metric, on_epoch=True, on_step=False, prog_bar=True, logger=True) else: # just log everything self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, 
logger=True) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 96b0e503..6f464ab8 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -335,45 +335,45 @@ def train_default_zoobot_from_scratch( # can test as per the below, but note that datamodule must have a test dataset attribute as per pytorch lightning docs. # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting - if datamodule.test_dataloader is not None: - logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') - # test_trainer.validate( - # model=lightning_model, - # datamodule=datamodule, - # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - # ) - test_trainer = pl.Trainer( - accelerator=accelerator, - devices=1, - precision=precision, - logger=wandb_logger, - default_root_dir=save_dir - ) - if test_trainer.is_global_zero: - test_datamodule = webdatamodule.WebDataModule( - train_urls=None, - val_urls=None, - test_urls=test_urls, - label_cols=schema.label_cols, - batch_size=batch_size, - num_workers=1, # 20 / 5 == 4, /2=2 - prefetch_factor=prefetch_factor, - cache_dir=None, - color=color, - crop_scale_bounds=crop_scale_bounds, - crop_ratio_bounds=crop_ratio_bounds, - resize_after_crop=resize_after_crop - ) - test_datamodule.setup(stage='test') - test_trainer.test( - model=lightning_model, - datamodule=test_datamodule, - ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - ) - else: - logging.info('Not global zero, skipping test metrics') - else: - logging.info('No test dataloader found, skipping test metrics') + # if datamodule.test_dataloader is not None: + # logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') + # # test_trainer.validate( + # # model=lightning_model, + # # datamodule=datamodule, + # # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + # # ) + # test_trainer = pl.Trainer( + # accelerator=accelerator, + # devices=1, + # precision=precision, + # logger=wandb_logger, + # default_root_dir=save_dir + # ) + # if test_trainer.is_global_zero: + # test_datamodule = webdatamodule.WebDataModule( + # train_urls=None, + # val_urls=None, + # test_urls=test_urls, + # label_cols=schema.label_cols, + # batch_size=batch_size, + # num_workers=1, # 20 / 5 == 4, /2=2 + # prefetch_factor=prefetch_factor, + # cache_dir=None, + # color=color, + # crop_scale_bounds=crop_scale_bounds, + # crop_ratio_bounds=crop_ratio_bounds, + # resize_after_crop=resize_after_crop + # ) + datamodule.setup(stage='test') + trainer.test( + model=lightning_model, + datamodule=datamodule, + ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. 
"/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + ) + # else: + # logging.info('Not global zero, skipping test metrics') + # else: + # logging.info('No test dataloader found, skipping test metrics') # explicitly update the model weights to the best checkpoint before returning From 2b387c65ceb7a12e52f46e5c3056570f291fc34a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 15:34:42 -0500 Subject: [PATCH 241/307] it ran. now logging back on --- zoobot/pytorch/estimators/define_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 2524a116..1cb6e237 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -127,7 +127,7 @@ def on_validation_epoch_end(self) -> None: def on_test_epoch_end(self) -> None: logging.info('start test epoch end') - # self.log_all_metrics(subset='test') + self.log_all_metrics(subset='test') logging.info('end test epoch end') def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name): From 5c4c84ba0c0c99d431e35a0ea23834a5811a1502 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 21:01:40 -0500 Subject: [PATCH 242/307] oops, broke logging --- zoobot/pytorch/estimators/define_model.py | 13 +++++++------ .../training/train_with_pytorch_lightning.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 1cb6e237..79aa551a 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -126,9 +126,9 @@ def on_validation_epoch_end(self) -> None: self.log_all_metrics(subset='validation') def on_test_epoch_end(self) -> None: - logging.info('start test epoch end') + # logging.info('start test epoch end') self.log_all_metrics(subset='test') - logging.info('end test epoch end') + # logging.info('end test epoch end') def calculate_loss_and_update_loss_metrics(self, predictions, labels, step_name): raise NotImplementedError('Must be subclassed') @@ -138,10 +138,11 @@ def update_other_metrics(self, outputs, step_name): def log_all_metrics(self, subset=None): if subset is not None: - for name, metric in self.loss_metrics.items(): - if subset in name: - logging.info(name) - self.log(name, metric, on_epoch=True, on_step=False, prog_bar=True, logger=True) + for metric_collection in (self.loss_metrics, self.question_loss_metrics, self.campaign_loss_metrics): + for name, metric in metric_collection.items(): + if subset in name: + logging.info(name) + self.log(name, metric, on_epoch=True, on_step=False, prog_bar=True, logger=True) else: # just log everything self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, logger=True) self.log_dict(self.question_loss_metrics, on_step=False, on_epoch=True, logger=True) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 6f464ab8..c83acfdd 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -335,8 +335,8 @@ def train_default_zoobot_from_scratch( # can test as per the below, but note that datamodule must have a test dataset attribute as per pytorch lightning docs. 
# also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting - # if datamodule.test_dataloader is not None: - # logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') + if datamodule.test_dataloader is not None: + logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') # # test_trainer.validate( # # model=lightning_model, # # datamodule=datamodule, @@ -364,12 +364,12 @@ def train_default_zoobot_from_scratch( # crop_ratio_bounds=crop_ratio_bounds, # resize_after_crop=resize_after_crop # ) - datamodule.setup(stage='test') - trainer.test( - model=lightning_model, - datamodule=datamodule, - ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - ) + datamodule.setup(stage='test') + trainer.test( + model=lightning_model, + datamodule=datamodule, + ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" + ) # else: # logging.info('Not global zero, skipping test metrics') # else: From 53974dac8b8a011fbbf249312192e24ea5bf7158 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 15 Jan 2024 21:19:27 -0500 Subject: [PATCH 243/307] fix prog bar --- zoobot/pytorch/estimators/define_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 79aa551a..3471d668 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -139,10 +139,11 @@ def update_other_metrics(self, outputs, step_name): def log_all_metrics(self, subset=None): if subset is not None: for metric_collection in (self.loss_metrics, self.question_loss_metrics, self.campaign_loss_metrics): + prog_bar = metric_collection == self.loss_metrics for name, metric in metric_collection.items(): if subset in name: logging.info(name) - self.log(name, metric, on_epoch=True, on_step=False, prog_bar=True, logger=True) + self.log(name, metric, on_epoch=True, on_step=False, prog_bar=prog_bar, logger=True) else: # just log everything self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, logger=True) self.log_dict(self.question_loss_metrics, on_step=False, on_epoch=True, logger=True) From c8442dbe54c586e74e1e86f501cc8a8f81efd86f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 16 Jan 2024 10:16:37 -0500 Subject: [PATCH 244/307] try with aug fix --- zoobot/pytorch/datasets/webdatamodule.py | 20 +++++++++++++++++--- zoobot/pytorch/estimators/define_model.py | 3 ++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index abbb32f1..cc5d5800 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -1,4 +1,5 @@ import os +from collections import defaultdict from typing import Callable import logging import torch.utils.data @@ -89,6 +90,8 @@ def make_image_transform(self, mode="train"): def do_transform(img): + assert img.shape[2] < 4 # 1 or 3 channels in shape[2] 
dim, i.e. numpy/pil HWC convention + # if not, check decode mode is 'rgb' not 'torchrgb' return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) return do_transform @@ -105,11 +108,14 @@ def make_loader(self, urls, mode="train"): if self.train_transform is None: logging.info('Using default transform') + decode_mode = 'rgb' # np.array, for albumentations transform_image = self.make_image_transform(mode=mode) else: logging.info('Ignoring hparams and using directly-passed transforms') + decode_mode = 'torchrgb' # tensor, for torchvision transform_image = self.train_transform if mode == 'train' else self.inference_transform + transform_label = dict_to_label_cols_factory(self.label_cols) dataset = wds.WebDataset(urls, cache_dir=self.cache_dir, shardshuffle=shuffle>0, nodesplitter=nodesplitter_func) @@ -119,8 +125,7 @@ def make_loader(self, urls, mode="train"): if shuffle > 0: dataset = dataset.shuffle(shuffle) - # dataset = dataset.decode("rgb") # np.array, for albumentations - dataset = dataset.decode("torchrgb") # tensor, for torchvision + dataset = dataset.decode(decode_mode) if mode == 'predict': if self.label_cols != ['id_str']: @@ -222,9 +227,18 @@ def label_transform(label_dict): return identity # do nothing def dict_to_filled_dict_factory(label_cols): + logging.info(f'label cols: {label_cols}') # might be a little slow, but very safe def label_transform(label_dict: dict): + # modifies inplace with 0 iff key missing - [label_dict.setdefault(col, 0) for col in label_cols] + # [label_dict.setdefault(col, 0) for col in label_cols] + + for col in label_cols: + label_dict[col] = label_dict.get(col, 0) + + # label_dict_with_default = defaultdict(0) + # label_dict_with_default.update(label_dict) + return label_dict return label_transform \ No newline at end of file diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 3471d668..5fb8e3e3 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -87,6 +87,7 @@ def setup_metrics(self, nan_strategy='error'): # may sometimes want to ignore n def forward(self, x): + assert x.shape[1] < 4 # torchlike BCHW x = self.encoder(x) return self.head(x) @@ -142,7 +143,7 @@ def log_all_metrics(self, subset=None): prog_bar = metric_collection == self.loss_metrics for name, metric in metric_collection.items(): if subset in name: - logging.info(name) + # logging.info(name) self.log(name, metric, on_epoch=True, on_step=False, prog_bar=prog_bar, logger=True) else: # just log everything self.log_dict(self.loss_metrics, on_epoch=True, on_step=False, prog_bar=True, logger=True) From b65216555dc74ce76e5fbd1e3455ac4e8588c81f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 18 Jan 2024 20:41:02 -0500 Subject: [PATCH 245/307] train effnetv2xl on evo --- zoobot/pytorch/training/finetune.py | 44 ++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index f0851623..26462abc 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -15,6 +15,9 @@ import torchmetrics as tm import timm + +from foundation.models.mae_lightly import CustomMAEEncoder + from zoobot.pytorch.training import losses from zoobot.pytorch.estimators import define_model from zoobot.shared import schemas @@ -98,7 +101,12 @@ def __init__( assert encoder is not None, 'Must pass either checkpoint to load or 
encoder to use' self.encoder = encoder - self.encoder_dim = define_model.get_encoder_dim(self.encoder) + # TODO read as encoder property + if isinstance(self.encoder, CustomMAEEncoder): + self.encoder_dim = 256 # vit hidden dim, assuming average pool over seq dim + # self.encoder_dim = 9216 + else: + self.encoder_dim = define_model.get_encoder_dim(self.encoder) self.n_blocks = n_blocks # for backwards compat. @@ -125,15 +133,31 @@ def __init__( self.prog_bar = prog_bar self.visualize_images = visualize_images - def configure_optimizers(self): + def configure_optimizers(self): + + if isinstance(self.encoder, CustomMAEEncoder): + logging.info('Using custom optimizer for MAE encoder') + # equivalent to usual, but in param_group format + head_param_groups = [ + {'params': self.head.parameters(), + 'weight_decay': self.weight_decay, + 'lr_scale': 1. # no lr decay on head + } + ] + # now custom bit for the encoder + encoder_param_groups = self.encoder.get_param_groups(self.weight_decay, self.lr_decay) + n_param_groups_to_tune = self.n_blocks * 2 # finetune top N. First layer is pos embedding, then pairs of decay/no decay, 18 pairs by default + if n_param_groups_to_tune > len(encoder_param_groups): + logging.warning('more param groups (blocks*2) specified to finetune than available') + encoder_param_groups_to_tune = encoder_param_groups[-n_param_groups_to_tune:] + param_groups = encoder_param_groups_to_tune + head_param_groups + return torch.optim.AdamW(param_groups, lr=self.learning_rate) lr = self.learning_rate params = [{"params": self.head.parameters(), "lr": lr}] - # architecture = self.encoder.default_config['architecture'] logging.info(f'Encoder architecture to finetune: {type(self.encoder)}') - # if 'efficientnet' in architecture: if isinstance(self.encoder, timm.models.EfficientNet): # TODO for now, these count as separate layers, not ideal early_tuneable_layers = [self.encoder.conv_stem, self.encoder.bn1] @@ -152,16 +176,9 @@ def configure_optimizers(self): ] elif isinstance(self.encoder, timm.models.MaxxVit): blocks_to_tune = self.encoder.stem + [stage for stage in self.encoder.stages] - # [ - # getattr as obj.0 is not allowed (why does timm call them 0!?) 
- # getattr(self.encoder.stages, '0'), - # getattr(self.encoder.stages, '1'), - # getattr(self.encoder.stages, '2'), - # getattr(self.encoder.stages, '3'), - # ] else: raise ValueError(f'Encoder architecture not automatically recognised: {type(self.encoder)}') - + assert self.n_blocks <= len( blocks_to_tune ), f"Network only has {len(blocks_to_tune)} tuneable blocks, {self.n_blocks} specified for finetuning" @@ -181,8 +198,6 @@ def configure_optimizers(self): "lr": lr * (self.lr_decay**i) }) - logging.debug(params) - # optionally, for the remaining layers (not otherwise finetuned) you can choose to still FT the batchnorm layers for i, block in enumerate(remaining_blocks): if self.always_train_batchnorm: @@ -200,6 +215,7 @@ def configure_optimizers(self): def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.encoder(x) x = self.head(x) + # TODO encoder output shape changes with input shape (of course) so need to specify explicitly or skip return x def make_step(self, batch): From 938d7ca5c00a7c18ad93e2fedc8659bc165872d0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 29 Jan 2024 13:06:04 -0500 Subject: [PATCH 246/307] careful with nans --- zoobot/pytorch/estimators/define_model.py | 2 +- zoobot/pytorch/training/finetune.py | 114 +++++++++++++++++- zoobot/pytorch/training/losses.py | 16 ++- .../training/train_with_pytorch_lightning.py | 43 +------ 4 files changed, 126 insertions(+), 49 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 5fb8e3e3..4ae39cf7 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -370,7 +370,7 @@ def dirichlet_loss(preds, labels, question_index_groups, sum_over_questions=Fals # multiquestion_loss returns loss of shape (batch, question) # torch.sum(multiquestion_loss, axis=1) gives loss of shape (batch). Equiv. to non-log product of question likelihoods. - multiq_loss = losses.calculate_multiquestion_loss(labels, preds, question_index_groups) + multiq_loss = losses.calculate_multiquestion_loss(labels, preds, question_index_groups, careful=True) if sum_over_questions: return torch.sum(multiq_loss, axis=1) else: diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 26462abc..b3d7dbe9 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -322,7 +322,7 @@ def __init__( super().__init__(**super_kwargs) logging.info('Using classification head and cross-entropy loss') - self.head = LinearClassifier( + self.head = LinearHead( input_dim=self.encoder_dim, output_dim=num_classes, dropout_prob=self.dropout_prob @@ -387,7 +387,7 @@ def predict_step(self, x: Union[list[torch.Tensor], torch.Tensor], batch_idx): # see Abstract version if isinstance(x, list) and len(x) == 1: return self(x[0]) - x = self.forward(x) # type: ignore # logits from LinearClassifier + x = self.forward(x) # type: ignore # logits from LinearHead # then applies softmax return F.softmax(x, dim=1) @@ -407,6 +407,98 @@ def upload_images_to_wandb(self, outputs, batch, batch_idx): caption=captions) + +class FinetuneableZoobotRegressor(FinetuneableZoobotAbstract): + """ + Pretrained Zoobot model intended for finetuning on a regression problem. 
+ + See FinetuneableZoobotClassifier, above + + Args: + None besides those from FinetuneableZoobotAbstract, above (1 class, MSE error, for now) + + """ + + def __init__( + self, + **super_kwargs) -> None: + + super().__init__(**super_kwargs) + + logging.info('Using classification head and cross-entropy loss') + self.head = LinearHead( + input_dim=self.encoder_dim, + output_dim=1, + dropout_prob=self.dropout_prob + ) + self.loss = mse_loss + # rmse metrics. loss is mse already. + self.train_rmse = tm.MeanSquaredError(squared=False) + self.val_rmse = tm.MeanSquaredError(squared=False) + self.test_rmse = tm.MeanSquaredError(squared=False) + + def step_to_dict(self, y, y_pred, loss): + return {'loss': loss.mean(), 'predictions': y_pred, 'labels': y} + + def on_train_batch_end(self, step_output, *args): + super().on_train_batch_end(step_output, *args) + + self.train_rmse(step_output['predictions'], step_output['labels']) + self.log( + 'finetuning/train_rmse', + self.train_rmse, + on_step=False, + on_epoch=True, + prog_bar=self.prog_bar + ) + + def on_validation_batch_end(self, step_output, *args): + super().on_validation_batch_end(step_output, *args) + + self.val_rmse(step_output['predictions'], step_output['labels']) + self.log( + 'finetuning/val_rmse', + self.val_rmse, + on_step=False, + on_epoch=True, + prog_bar=self.prog_bar + ) + + def on_test_batch_end(self, step_output, *args) -> None: + super().on_test_batch_end(step_output, *args) + + self.test_rmse(step_output['predictions'], step_output['labels']) + self.log( + "finetuning/test_rmse", + self.test_rmse, + on_step=False, + on_epoch=True, + prog_bar=self.prog_bar + ) + + + def predict_step(self, x: Union[list[torch.Tensor], torch.Tensor], batch_idx): + # see Abstract version + if isinstance(x, list) and len(x) == 1: + return self(x[0]) + return self.forward(x) + + # TODO + # def upload_images_to_wandb(self, outputs, batch, batch_idx): + # # self.logger is set by pl.Trainer(logger=) argument + # if (self.logger is not None) and (batch_idx == 0): + # x, y = batch + # y_pred_softmax = F.softmax(outputs['predictions'], dim=1) + # n_images = 5 + # images = [img for img in x[:n_images]] + # captions = [f'Ground Truth: {y_i} \nPrediction: {y_p_i}' for y_i, y_p_i in zip( + # y[:n_images], y_pred_softmax[:n_images])] + # self.logger.log_image( # type: ignore + # key='val_images', + # images=images, + # caption=captions) + + class FinetuneableZoobotTree(FinetuneableZoobotAbstract): """ Pretrained Zoobot model intended for finetuning on a decision tree (i.e. GZ-like) problem. 
@@ -447,10 +539,11 @@ def upload_images_to_wandb(self, outputs, batch, batch_idx): # other functions are simply inherited from FinetunedZoobotAbstract # https://github.com/inigoval/byol/blob/1da1bba7dc5cabe2b47956f9d7c6277decd16cc7/byol_main/networks/models.py#L29 -class LinearClassifier(torch.nn.Module): +class LinearHead(torch.nn.Module): def __init__(self, input_dim, output_dim, dropout_prob=0.5): # input dim is representation dim, output_dim is num classes - super(LinearClassifier, self).__init__() + super(LinearHead, self).__init__() + self.output_dim = output_dim self.dropout = torch.nn.Dropout(p=dropout_prob) self.linear = torch.nn.Linear(input_dim, output_dim) @@ -458,7 +551,11 @@ def forward(self, x): # returns logits, as recommended for CrossEntropy loss x = self.dropout(x) x = self.linear(x) - return x + if self.output_dim == 1: + return x.squeeze() + else: + return x + def cross_entropy_loss(y_pred, y, label_smoothing=0., weight=None): @@ -468,6 +565,13 @@ def cross_entropy_loss(y_pred, y, label_smoothing=0., weight=None): # will reduce myself return F.cross_entropy(y_pred, y.long(), label_smoothing=label_smoothing, weight=weight, reduction='none') +def mse_loss(y_pred, y): + # y should be shape (batch) and ints + # y_pred should be shape (batch, classes) + # returns loss of shape (batch) + # will reduce myself + return F.mse_loss(y_pred, y, reduction='none') + def dirichlet_loss(y_pred, y, question_index_groups): # aggregation equiv. to sum(axis=1).mean(), but fewer operations diff --git a/zoobot/pytorch/training/losses.py b/zoobot/pytorch/training/losses.py index b3c74029..77b15761 100755 --- a/zoobot/pytorch/training/losses.py +++ b/zoobot/pytorch/training/losses.py @@ -1,10 +1,11 @@ from typing import Tuple +import logging import torch import pyro -def calculate_multiquestion_loss(labels: torch.Tensor, predictions: torch.Tensor, question_index_groups: Tuple) -> torch.Tensor: +def calculate_multiquestion_loss(labels: torch.Tensor, predictions: torch.Tensor, question_index_groups: Tuple, careful=True) -> torch.Tensor: """ The full decision tree loss used for training GZ DECaLS models @@ -19,6 +20,16 @@ def calculate_multiquestion_loss(labels: torch.Tensor, predictions: torch.Tensor Returns: torch.Tensor: neg. log likelihood of shape (batch, question). """ + if careful: + # some models give occasional nans for all predictions on a specific galaxy/row + # these are inputs to the loss and only happen many epochs in so probably not a case of bad labels, but rather some instability during training + # handle this by setting loss=0 for those rows and throwing a warning + nan_prediction = torch.isnan(predictions) | torch.isinf(predictions) + if nan_prediction.any(): + logging.warning(f'Found nan values in predictions: {predictions}') + safety_value = torch.ones(1, device=predictions.device, dtype=predictions.dtype) # fill with 1 everywhere i.e. 
fully uncertain + predictions = torch.where(condition=nan_prediction, input=safety_value, other=predictions) + # very important that question_index_groups is fixed and discrete, else tf.function autograph will mess up q_losses = [] # will give shape errors if model output dim is not labels dim, which can happen if losses.py substrings are missing an answer @@ -104,5 +115,6 @@ def dirichlet_loss(labels_for_q, concentrations_for_q): def get_dirichlet_neg_log_prob(labels_for_q, total_count, concentrations_for_q): # https://docs.pyro.ai/en/stable/distributions.html#dirichletmultinomial # .int()s avoid rounding errors causing loss of around 1e-5 for questions with 0 votes - dist = pyro.distributions.DirichletMultinomial(total_count=total_count.int(), concentration=concentrations_for_q, is_sparse=False, validate_args=False) + dist = pyro.distributions.DirichletMultinomial( + total_count=total_count.int(), concentration=concentrations_for_q, is_sparse=False, validate_args=True) return -dist.log_prob(labels_for_q.int()) # important minus sign diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index c83acfdd..b5cbb504 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -268,13 +268,11 @@ def train_default_zoobot_from_scratch( # these args are automatically logged lightning_model = define_model.ZoobotTree( output_dim=len(schema.label_cols), - # question_index_groups=schema.question_index_groups, # NEW - pass these from schema, for better logging question_answer_pairs=schema.question_answer_pairs, dependencies=schema.dependencies, architecture_name=architecture_name, channels=channels, - # use_imagenet_weights=False, test_time_dropout=True, dropout_rate=dropout_rate, learning_rate=learning_rate, @@ -306,7 +304,6 @@ def train_default_zoobot_from_scratch( early_stopping_callback = EarlyStopping(monitor=monitor_metric, patience=patience, check_finite=True) callbacks = [checkpoint_callback, early_stopping_callback] + extra_callbacks - # callbacks = None trainer = pl.Trainer( num_sanity_val_steps=0, @@ -321,14 +318,9 @@ def train_default_zoobot_from_scratch( max_epochs=epochs, default_root_dir=save_dir, plugins=plugins, - gradient_clip_val=1. # new, for large models - # , - # limit_train_batches=1, - # limit_val_batches=1 - # use_distributed_sampler=use_distributed_sampler + gradient_clip_val=.3 # reduced from 1 to .3, having some nan issues ) - trainer.fit(lightning_model, datamodule) # uses batch size of datamodule best_model_path = trainer.checkpoint_callback.best_model_path @@ -337,44 +329,13 @@ def train_default_zoobot_from_scratch( # also be careful not to test regularly, as this breaks train/val/test conceptual separation and may cause hparam overfitting if datamodule.test_dataloader is not None: logging.info(f'Testing on {checkpoint_callback.best_model_path} with single GPU. Be careful not to overfit your choices to the test data...') - # # test_trainer.validate( - # # model=lightning_model, - # # datamodule=datamodule, - # # ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. 
"/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" - # # ) - # test_trainer = pl.Trainer( - # accelerator=accelerator, - # devices=1, - # precision=precision, - # logger=wandb_logger, - # default_root_dir=save_dir - # ) - # if test_trainer.is_global_zero: - # test_datamodule = webdatamodule.WebDataModule( - # train_urls=None, - # val_urls=None, - # test_urls=test_urls, - # label_cols=schema.label_cols, - # batch_size=batch_size, - # num_workers=1, # 20 / 5 == 4, /2=2 - # prefetch_factor=prefetch_factor, - # cache_dir=None, - # color=color, - # crop_scale_bounds=crop_scale_bounds, - # crop_ratio_bounds=crop_ratio_bounds, - # resize_after_crop=resize_after_crop - # ) datamodule.setup(stage='test') + # TODO with webdataset, no need for new trainer/datamodule (actually it breaks), but might still be needed with normal dataset? trainer.test( model=lightning_model, datamodule=datamodule, ckpt_path=checkpoint_callback.best_model_path # can optionally point to a specific checkpoint here e.g. "/share/nas2/walml/repos/gz-decals-classifiers/results/early_stopping_1xgpu_greyscale/checkpoints/epoch=26-step=16847.ckpt" ) - # else: - # logging.info('Not global zero, skipping test metrics') - # else: - # logging.info('No test dataloader found, skipping test metrics') - # explicitly update the model weights to the best checkpoint before returning # (assumes only one checkpoint callback, very likely in practice) From bb9322e63239ed36febf407e088080a3d4998bd6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 1 Feb 2024 12:38:21 -0500 Subject: [PATCH 247/307] ft tweaks --- zoobot/pytorch/estimators/define_model.py | 2 +- zoobot/pytorch/predictions/predict_on_catalog.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 5fb8e3e3..bd33427e 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -238,7 +238,7 @@ def __init__( if question_answer_pairs is not None: logging.info('question_index_groups/dependencies passed to Zoobot, constructing schema in __init__') - assert question_index_groups is None, "Don't pass both question_index_groups and question_answer_pairs/dependencies" + # assert question_index_groups is None, "Don't pass both question_index_groups and question_answer_pairs/dependencies" assert dependencies is not None self.schema = schemas.Schema(question_answer_pairs, dependencies) # replace with schema-derived version diff --git a/zoobot/pytorch/predictions/predict_on_catalog.py b/zoobot/pytorch/predictions/predict_on_catalog.py index 3a68ab88..0b99270f 100644 --- a/zoobot/pytorch/predictions/predict_on_catalog.py +++ b/zoobot/pytorch/predictions/predict_on_catalog.py @@ -38,14 +38,14 @@ def predict(catalog: pd.DataFrame, model: pl.LightningModule, n_samples: int, la # crucial to specify the stage, or will error (as missing other catalogs) predict_datamodule.setup(stage='predict') # for images in predict_datamodule.predict_dataloader(): - # print(images) - # print(images.shape) + # print(images) + # print(images.shape) + # exit() # set up trainer (again) trainer = pl.Trainer( max_epochs=-1, # does nothing in this context, suppresses warning - inference_mode=True, # no grads needed **trainer_kwargs # e.g. 
gpus ) From 122165d8f4f026b8ae55a61a316fefeb7d6cc95a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 5 Feb 2024 09:46:01 -0500 Subject: [PATCH 248/307] prep for 2.0 --- README.md | 64 ++++++++++++++++++------------------------------------- setup.py | 2 +- 2 files changed, 22 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 38fc6813..b52d708a 100755 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Download the code using git: git clone git@github.com:mwalmsley/zoobot.git -And then pick one of the three commands below to install Zoobot and either PyTorch (recommended) or TensorFlow: +And then pick one of the three commands below to install Zoobot and PyTorch: # Zoobot with PyTorch and a GPU. Requires CUDA 12.1 (or CUDA 11.8, if you use `_cu118` instead) pip install -e "zoobot[pytorch-cu121]" --extra-index-url https://download.pytorch.org/whl/cu121 @@ -41,9 +41,6 @@ And then pick one of the three commands below to install Zoobot and either PyTor # OR Zoobot with PyTorch on Mac with M1 chip pip install -e "zoobot[pytorch-m1]" - # OR Zoobot with TensorFlow. Works with and without a GPU, but if you have a GPU, you need CUDA 11.2. - pip install -e "zoobot[tensorflow] - This installs the downloaded Zoobot code using pip [editable mode](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs) so you can easily change the code locally. Zoobot is also available directly from pip (`pip install zoobot[option]`). Only use this if you are sure you won't be making changes to Zoobot itself. For Google Colab, use `pip install zoobot[pytorch_colab]` To use a GPU, you must *already* have CUDA installed and matching the versions above. @@ -115,12 +112,6 @@ PyTorch (recommended): - [pytorch/examples/representations/get_representations.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/pytorch/examples/representations/get_representations.py) - [pytorch/examples/train_model_on_catalog.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/pytorch/examples/train_model_on_catalog.py) (only necessary to train from scratch) -TensorFlow: -- [tensorflow/examples/train_model_on_catalog.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/tensorflow/examples/train_model_on_catalog.py) (only necessary to train from scratch) -- [tensorflow/examples/make_predictions.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/tensorflow/examples/make_predictions.py) -- [tensorflow/examples/finetune_minimal.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/tensorflow/examples/finetune_minimal.py) -- [tensorflow/examples/finetune_advanced.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/tensorflow/examples/finetune_advanced.py) - There is more explanation and an API reference on the [docs](https://zoobot.readthedocs.io/). I also [include](https://github.com/mwalmsley/zoobot/blob/main/benchmarks) the scripts used to create and benchmark our pretrained models. Many pretrained models are available [already](https://zoobot.readthedocs.io/en/latest/data_notes.html), but if you need one trained on e.g. different input image sizes or with a specific architecture, I can probably make it for you. @@ -129,44 +120,30 @@ When trained with a decision tree head (ZoobotTree, FinetuneableZoobotTree), Zoo -### (Optional) Install PyTorch or TensorFlow, with CUDA +### (Optional) Install PyTorch with CUDA -*If you're not using a GPU, skip this step. 
Use the pytorch_cpu or tensorflow_cpu options in the section below.* - -Install PyTorch 1.12.1 or Tensorflow 2.10.0 and compatible CUDA drivers. I highly recommend using [conda](https://docs.conda.io/en/latest/miniconda.html) to do this. Conda will handle both creating a new virtual environment (`conda create`) and installing CUDA (`cudatoolkit`, `cudnn`) - -CUDA 11.3 for PyTorch: +*If you're not using a GPU, skip this step. Use the pytorch-cpu option in the section below.* - conda create --name zoobot38_torch python==3.8 - conda activate zoobot38_torch - conda install -c conda-forge cudatoolkit=11.3 +Install PyTorch 2.1.0 or Tensorflow 2.10.0 and compatible CUDA drivers. I highly recommend using [conda](https://docs.conda.io/en/latest/miniconda.html) to do this. Conda will handle both creating a new virtual environment (`conda create`) and installing CUDA (`cudatoolkit`, `cudnn`) -CUDA 11.2 and CUDNN 8.1 for TensorFlow 2.10.0: +CUDA 12.1 for PyTorch 2.1.0: - conda create --name zoobot38_tf python==3.8 - conda activate zoobot38_tf - conda install -c conda-forge cudatoolkit=11.2 cudnn=8.1.0 - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/ # add this environment variable + conda create --name zoobot39_torch python==3.9 + conda activate zoobot39_torch + conda install -c conda-forge cudatoolkit=12.1 -### Latest minor features (v1.0.4) -- Now supports multi-class finetuning. See `pytorch/examples/finetuning/finetune_multiclass_classification.py` -- Removed `simplejpeg` dependency due to M1 install issue. -- Pinned `timm` version to ensure MaX-ViT models load correctly. Models supporting the latest `timm` will follow. -- (internal until published) GZ Evo v2 now includes Cosmic Dawn (HSC). Significant performance improvement on HSC finetuning. +### Recent release features (v2.0.0) -### Latest major features (v1.0.0) - -v1.0.0 recognises that most of the complexity in this repo is training Zoobot from scratch, but most non-GZ users will probably simply want to load the pretrained Zoobot and finetune it on their data. - -- Adds new finetuning interface (`finetune.run_finetuning()`), examples. -- Refocuses docs on finetuning rather than training from scratch. -- Rework installation process to separate CUDA from Zoobot (simpler, easier) -- Better wandb logging throughout, to monitor training -- Remove need to make TFRecords. Now TF directly uses images. -- Refactor out augmentations and datasets to `galaxy-datasets` repo. TF and Torch now use identical augmentations (via albumentations). -- Many small quality-of-life improvements +- New pretrained architectures: EfficientnetV2 S/M/L, MaxViT tiny/small/base, ViT tiny/small, and more. +- Reworked finetuning procedure. All these architectures are finetuneable through a common method. +- Now supports regression finetuning (as well as multi-class and binary). See `pytorch/examples/finetuning` +- Updated `timm` to 0.9.10, allowing latest model architectures. Previously downloaded checkpoints may not load correctly! +- (internal until published) GZ Evo v2 now includes Cosmic Dawn (HSC H2O). Significant performance improvement on HSC finetuning. Also now includes GZ UKIDSS (dragged from our archives). +- Updated `pytorch` to `2.1.0` +- Added option to compile encoder for max speed (not recommended for finetuning, only for pretraining). +- Deprecates TensorFlow. The CS research community focuses on PyTorch and new frameworks like JAX. Contributions are very welcome and will be credited in any future work. Please get in touch! 
See [CONTRIBUTING.md](https://github.com/mwalmsley/zoobot/blob/main/benchmarks) for more. @@ -176,6 +153,8 @@ The [benchmarks](https://github.com/mwalmsley/zoobot/blob/main/benchmarks) folde Training Zoobot using the GZ DECaLS dataset option will create models very similar to those used for the GZ DECaLS catalogue and shared with the early versions of this repo. The GZ DESI Zoobot model is trained on additional data (GZD-1, GZD-2), as the GZ Evo Zoobot model (GZD-1/2/5, Hubble, Candels, GZ2). +**Pretraining is becoming increasingly complex and is now partially refactored out to a separate repository. We are gradually migrating this `zoobot` repository to focus on finetuning.** + ### Citing If you use this software, or otherwise wish to cite Zoobot as a software package, please use the [JOSS paper](https://doi.org/10.21105/joss.05312): @@ -189,10 +168,9 @@ You might be interested in reading papers using Zoobot: - [Practical Galaxy Morphology Tools from Deep Supervised Representation Learning](https://arxiv.org/abs/2110.12735) (2022) - [Towards Foundation Models for Galaxy Morphology](https://arxiv.org/abs/2206.11927) (2022) - [Harnessing the Hubble Space Telescope Archives: A Catalogue of 21,926 Interacting Galaxies](https://arxiv.org/abs/2303.00366) (2023) -- [Astronomaly at Scale: Searching for Anomalies Amongst 4 Million Galaxies](https://arxiv.org/abs/2309.08660) (2023) - [Galaxy Zoo DESI: Detailed morphology measurements for 8.7M galaxies in the DESI Legacy Imaging Surveys](https://academic.oup.com/mnras/advance-article/doi/10.1093/mnras/stad2919/7283169?login=false) (2023) - [Galaxy mergers in Subaru HSC-SSP: A deep representation learning approach for identification, and the role of environment on merger incidence](https://doi.org/10.1051/0004-6361/202346743) (2023) - - +- [Astronomaly at Scale: Searching for Anomalies Amongst 4 Million Galaxies](https://arxiv.org/abs/2309.08660) (2023, submitted) +- [Transfer learning for galaxy feature detection: Finding Giant Star-forming Clumps in low redshift galaxies using Faster R-CNN](https://arxiv.org/abs/2312.03503) (2023, submitted) Many other works use Zoobot indirectly via the [Galaxy Zoo DECaLS](https://arxiv.org/abs/2102.08414) catalog (and now via the new [Galaxy Zoo DESI](https://academic.oup.com/mnras/advance-article/doi/10.1093/mnras/stad2919/7283169?login=false) catalog). 
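As a quick check that the install commands and CUDA pairing described in the README diff above worked, something like the snippet below can be run in the new environment. It assumes the `pytorch-cu121` extra (or another GPU-enabled option) was installed; the exact versions printed are only what the release notes lead one to expect.

    import torch
    import timm

    print('torch:', torch.__version__)              # release notes above target PyTorch 2.1.0
    print('CUDA available:', torch.cuda.is_available())
    print('timm:', timm.__version__)                # release notes above mention timm 0.9.10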
diff --git a/setup.py b/setup.py index 712233e5..1bf92be6 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="zoobot", - version="1.0.6", + version="2.0.0", author="Mike Walmsley", author_email="walmsleymk1@gmail.com", description="Galaxy morphology classifiers", From adcce7d37972405a35e450ef1791c5f083d950f2 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 5 Feb 2024 13:18:10 -0500 Subject: [PATCH 249/307] typo --- zoobot/pytorch/estimators/define_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 1f7e8237..72ddf269 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -264,7 +264,7 @@ def __init__( self.encoder = torch.compile(self.encoder) # bit lazy assuming 224 input size - self.encoder_dim = get_encoder_dim(self.encoder) + self.encoder_dim = get_encoder_dim(self.encoder, channels) # typically encoder_dim=1280 for effnetb0 logging.info('encoder dim: {}'.format(self.encoder_dim)) @@ -379,12 +379,12 @@ def dirichlet_loss(preds, labels, question_index_groups, sum_over_questions=Fals # input_size doesn't matter as long as it's large enough to not be pooled to zero # channels doesn't matter at all -def get_encoder_dim(encoder): +def get_encoder_dim(encoder, channels=3): try: - x = torch.randn(1, 3, 224, 224) # BCHW + x = torch.randn(1, channels, 224, 224) # BCHW return encoder(x).shape[-1] except RuntimeError: # tensor might not be on same device as model, just try the only other option - x = torch.randn(1, 3, 224, 224).to('cuda') + x = torch.randn(1, channels, 224, 224).to('cuda') return encoder(x).shape[-1] From 3095b5a5c625b6671fb3e51e0aa4ae29e8b7e2c8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 6 Feb 2024 15:06:39 -0500 Subject: [PATCH 250/307] maxvit fix --- zoobot/pytorch/training/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index b3d7dbe9..0a846cb9 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -175,7 +175,7 @@ def configure_optimizers(self): self.encoder.layer4 ] elif isinstance(self.encoder, timm.models.MaxxVit): - blocks_to_tune = self.encoder.stem + [stage for stage in self.encoder.stages] + blocks_to_tune = [self.encoder.stem] + [stage for stage in self.encoder.stages] else: raise ValueError(f'Encoder architecture not automatically recognised: {type(self.encoder)}') From 8ae3e5a7f827595fb29452e0bab8ec9184792921 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 8 Feb 2024 17:53:09 +0000 Subject: [PATCH 251/307] add sync batchnorm option --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index b5cbb504..967d3409 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -4,6 +4,7 @@ import torch import pytorch_lightning as pl +from pytorch_lightning.plugins import TorchSyncBatchNorm from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.callbacks.early_stopping import EarlyStopping @@ -49,6 +50,7 @@ def train_default_zoobot_from_scratch( # hardware parameters nodes=1, gpus=2, + 
sync_batchnorm=False, num_workers=4, prefetch_factor=4, mixed_precision=False, @@ -283,6 +285,11 @@ def train_default_zoobot_from_scratch( weight_decay=weight_decay, scheduler_params=scheduler_params ) + + if sync_batchnorm: + logging.info('Using sync batchnorm') + lightning_model = TorchSyncBatchNorm.apply(lightning_model) + extra_callbacks = extra_callbacks if extra_callbacks else [] From f3540786bf50d86050ffcf5bf0a6fcbc69ac0873 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 9 Feb 2024 22:20:52 +0000 Subject: [PATCH 252/307] try sync again --- zoobot/pytorch/estimators/define_model.py | 19 ++++++++++++------- .../training/train_with_pytorch_lightning.py | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 72ddf269..04a7d2e4 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -264,6 +264,7 @@ def __init__( self.encoder = torch.compile(self.encoder) # bit lazy assuming 224 input size + # logging.warning(channels) self.encoder_dim = get_encoder_dim(self.encoder, channels) # typically encoder_dim=1280 for effnetb0 logging.info('encoder dim: {}'.format(self.encoder_dim)) @@ -378,17 +379,21 @@ def dirichlet_loss(preds, labels, question_index_groups, sum_over_questions=Fals # input_size doesn't matter as long as it's large enough to not be pooled to zero -# channels doesn't matter at all +# channels doesn't matter at all but has to match encoder channels or shape error def get_encoder_dim(encoder, channels=3): try: - x = torch.randn(1, channels, 224, 224) # BCHW + x = torch.randn(2, channels, 224, 224, device=encoder.device) # BCHW return encoder(x).shape[-1] - except RuntimeError: # tensor might not be on same device as model, just try the only other option - x = torch.randn(1, channels, 224, 224).to('cuda') - return encoder(x).shape[-1] - - + except RuntimeError as e: + if 'channels instead' in str(e): + logging.info('encoder dim search failed on channels, trying with channels=1') + channels = 1 + x = torch.randn(2, channels, 224, 224, device=encoder.device) # BCHW + return encoder(x).shape[-1] + else: + raise e + def get_pytorch_encoder( architecture_name='efficientnet_b0', channels=1, diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 967d3409..1877cdbc 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -288,7 +288,7 @@ def train_default_zoobot_from_scratch( if sync_batchnorm: logging.info('Using sync batchnorm') - lightning_model = TorchSyncBatchNorm.apply(lightning_model) + lightning_model = TorchSyncBatchNorm().apply(lightning_model) extra_callbacks = extra_callbacks if extra_callbacks else [] From 164448cbf9ef24a9d8127a5c4fe869d01d1c3093 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 11 Feb 2024 11:00:50 +0000 Subject: [PATCH 253/307] fix encoder dim --- zoobot/pytorch/estimators/define_model.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 04a7d2e4..d35dc151 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -381,14 +381,15 @@ def dirichlet_loss(preds, labels, question_index_groups, sum_over_questions=Fals # input_size doesn't matter as long as it's large enough to not 
be pooled to zero # channels doesn't matter at all but has to match encoder channels or shape error def get_encoder_dim(encoder, channels=3): + device = next(encoder.parameters()).device try: - x = torch.randn(2, channels, 224, 224, device=encoder.device) # BCHW + x = torch.randn(2, channels, 224, 224, device=device) # BCHW return encoder(x).shape[-1] except RuntimeError as e: if 'channels instead' in str(e): logging.info('encoder dim search failed on channels, trying with channels=1') channels = 1 - x = torch.randn(2, channels, 224, 224, device=encoder.device) # BCHW + x = torch.randn(2, channels, 224, 224, device=device) # BCHW return encoder(x).shape[-1] else: raise e @@ -474,3 +475,9 @@ def get_pytorch_dirichlet_head(encoder_dim: int, output_dim: int, test_time_drop def schema_to_campaigns(schema): # e.g. [gz2, dr12, ...] return [question.text.split('-')[-1] for question in schema.questions] + + +if __name__ == '__main__': + encoder = get_pytorch_encoder(channels=1) + dim = get_encoder_dim(encoder, channels=1) + print(dim) \ No newline at end of file From f48f5ecbcfcdf2729ad3b2fe2167f16e95b5efea Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 26 Feb 2024 13:04:17 -0500 Subject: [PATCH 254/307] unit interval --- zoobot/pytorch/training/finetune.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 0a846cb9..144ce79b 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -421,15 +421,24 @@ class FinetuneableZoobotRegressor(FinetuneableZoobotAbstract): def __init__( self, + unit_interval=False, **super_kwargs) -> None: super().__init__(**super_kwargs) + self.unit_interval = unit_interval + if self.unit_interval: + logging.info('unit_interval=True, using sigmoid activation for finetunng head') + head_activation = torch.nn.functional.sigmoid + else: + head_activation = None + logging.info('Using classification head and cross-entropy loss') self.head = LinearHead( input_dim=self.encoder_dim, output_dim=1, - dropout_prob=self.dropout_prob + dropout_prob=self.dropout_prob, + activation=head_activation ) self.loss = mse_loss # rmse metrics. loss is mse already. 
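To illustrate the regression head this patch is building (dropout, a single-output linear layer, an optional sigmoid for unit-interval targets, and per-galaxy MSE), here is a small standalone sketch. The 1280-dimensional representation is an assumption matching the EfficientNet-B0 encoder dim mentioned earlier in these patches; nothing below is the actual Zoobot class.

    import torch
    import torch.nn.functional as F

    head = torch.nn.Sequential(
        torch.nn.Dropout(p=0.5),
        torch.nn.Linear(1280, 1),   # encoder_dim assumed 1280 (effnetb0), output_dim=1 for regression
        torch.nn.Sigmoid(),         # only sensible when the target is known to lie in [0, 1]
    )

    representation = torch.randn(8, 1280)             # fake batch of encoder outputs
    prediction = head(representation).squeeze(-1)     # shape (8,), each value in (0, 1)
    target = torch.rand(8)                            # fake unit-interval labels
    loss = F.mse_loss(prediction, target, reduction='none')  # per-galaxy loss, shape (8,)
    print(loss.mean())                                # reduced to a scalar, as step_to_dict does with .mean()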
@@ -540,17 +549,21 @@ def upload_images_to_wandb(self, outputs, batch, batch_idx): # https://github.com/inigoval/byol/blob/1da1bba7dc5cabe2b47956f9d7c6277decd16cc7/byol_main/networks/models.py#L29 class LinearHead(torch.nn.Module): - def __init__(self, input_dim, output_dim, dropout_prob=0.5): + def __init__(self, input_dim, output_dim, dropout_prob=0.5, activation=None): # input dim is representation dim, output_dim is num classes super(LinearHead, self).__init__() self.output_dim = output_dim self.dropout = torch.nn.Dropout(p=dropout_prob) self.linear = torch.nn.Linear(input_dim, output_dim) + if activation is not None: + self.activation = activation def forward(self, x): # returns logits, as recommended for CrossEntropy loss x = self.dropout(x) x = self.linear(x) + if self.activation is not None: + x = self.activation(x) if self.output_dim == 1: return x.squeeze() else: From 797b7488512f40924dc333d4f303977e6db94680 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Mon, 26 Feb 2024 22:36:57 -0500 Subject: [PATCH 255/307] typo --- zoobot/pytorch/training/finetune.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 144ce79b..7e963a99 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -555,8 +555,7 @@ def __init__(self, input_dim, output_dim, dropout_prob=0.5, activation=None): self.output_dim = output_dim self.dropout = torch.nn.Dropout(p=dropout_prob) self.linear = torch.nn.Linear(input_dim, output_dim) - if activation is not None: - self.activation = activation + self.activation = activation def forward(self, x): # returns logits, as recommended for CrossEntropy loss From 9eb0d89408eab28eb0ddd1b2eed72011fb769563 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 27 Feb 2024 13:20:27 -0500 Subject: [PATCH 256/307] fix typo --- zoobot/pytorch/training/finetune.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 7e963a99..7902db7e 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -108,12 +108,13 @@ def __init__( else: self.encoder_dim = define_model.get_encoder_dim(self.encoder) self.n_blocks = n_blocks + logging.info('Blocks to finetune: {}'.format(n_layers)) # for backwards compat. if n_layers: logging.warning('FinetuneableZoobot(n_layers) is now renamed to n_blocks, please update to pass n_blocks instead! 
For now, setting n_blocks=n_layers') self.n_blocks = n_layers - logging.info('Layers to finetune: {}'.format(n_layers)) + logging.info('Layers to finetune: {}'.format(n_layers)) self.learning_rate = learning_rate self.lr_decay = lr_decay From b2f8e395926348574eb1c1595f05c88ba058042d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 27 Feb 2024 16:35:22 -0500 Subject: [PATCH 257/307] add convnext support --- zoobot/pytorch/training/finetune.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 7902db7e..4b461555 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -159,7 +159,7 @@ def configure_optimizers(self): logging.info(f'Encoder architecture to finetune: {type(self.encoder)}') - if isinstance(self.encoder, timm.models.EfficientNet): + if isinstance(self.encoder, timm.models.EfficientNet): # includes v2 # TODO for now, these count as separate layers, not ideal early_tuneable_layers = [self.encoder.conv_stem, self.encoder.bn1] encoder_blocks = list(self.encoder.blocks) @@ -177,6 +177,9 @@ def configure_optimizers(self): ] elif isinstance(self.encoder, timm.models.MaxxVit): blocks_to_tune = [self.encoder.stem] + [stage for stage in self.encoder.stages] + elif isinstance(self.encoder, timm.models.ConvNeXt): # stem + 3 blocks, for all sizes + # https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py#L264 + blocks_to_tune = [self.encoder.stem] + [stage for stage in self.encoder.stages] else: raise ValueError(f'Encoder architecture not automatically recognised: {type(self.encoder)}') From 772660d5a7f7518c93610da5aca93a01e2cecd16 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 27 Feb 2024 16:37:30 -0500 Subject: [PATCH 258/307] docs --- zoobot/pytorch/training/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 4b461555..6e9136fa 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -177,7 +177,7 @@ def configure_optimizers(self): ] elif isinstance(self.encoder, timm.models.MaxxVit): blocks_to_tune = [self.encoder.stem] + [stage for stage in self.encoder.stages] - elif isinstance(self.encoder, timm.models.ConvNeXt): # stem + 3 blocks, for all sizes + elif isinstance(self.encoder, timm.models.ConvNeXt): # stem + 4 blocks, for all sizes # https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py#L264 blocks_to_tune = [self.encoder.stem] + [stage for stage in self.encoder.stages] else: From 90c33f5ae510f9c73cb305309b1676f22af276c1 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 29 Feb 2024 20:01:48 -0500 Subject: [PATCH 259/307] add cosine support --- zoobot/pytorch/training/finetune.py | 40 ++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 6e9136fa..84aa14ba 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -9,6 +9,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint +from pytorch_lightning.callbacks import LearningRateMonitor import torch import torch.nn.functional as F @@ -67,7 +68,6 @@ def __init__( zoobot_checkpoint_loc=None, # ...or directly 
pass any model to use as encoder (if you do this, you will need to keep it around for later) encoder=None, - n_epochs=100, # TODO early stopping n_blocks=0, # how many layers deep to FT lr_decay=0.75, weight_decay=0.05, @@ -77,7 +77,12 @@ def __init__( prog_bar=True, visualize_images=False, # upload examples to wandb, good for debugging seed=42, - n_layers=0 # for backward compat., n_blocks preferred + n_layers=0, # for backward compat., n_blocks preferred + # these args are for the optional learning rate scheduler, best not to use unless you've tuned everything else already + cosine_schedule=False, + warmup_epochs=10, + max_cosine_epochs=100, + max_learning_rate_reduction_factor=0.01 ): super().__init__() @@ -120,7 +125,11 @@ def __init__( self.lr_decay = lr_decay self.weight_decay = weight_decay self.dropout_prob = dropout_prob - self.n_epochs = n_epochs + + self.cosine_schedule = cosine_schedule + self.warmup_epochs = warmup_epochs + self.max_cosine_epochs = max_cosine_epochs + self.max_learning_rate_reduction_factor = max_learning_rate_reduction_factor self.always_train_batchnorm = always_train_batchnorm if self.always_train_batchnorm: @@ -213,8 +222,25 @@ def configure_optimizers(self): # Initialize AdamW optimizer opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict - return opt - + if self.cosine_schedule: + from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy + return { + "optimizer": opt, + "lr_scheduler": { + "scheduler": CosineWarmupScheduler( + optimizer=opt, + warmup_epochs=self.warmup_epochs, + max_epochs=self.max_cosine_epochs, + start_value=self.learning_rate, + end_value=self.learning_rate * self.max_learning_rate_reduction_factor, + ), + 'interval': 'epoch', + "frequency": 1 + } + } + else: + return opt + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.encoder(x) @@ -669,10 +695,12 @@ def get_trainer( patience=patience ) + learning_rate_monitor_callback = LearningRateMonitor(logging_interval='epoch') + # Initialise pytorch lightning trainer trainer = pl.Trainer( logger=logger, - callbacks=[checkpoint_callback, early_stopping_callback], + callbacks=[checkpoint_callback, early_stopping_callback, learning_rate_monitor_callback], max_epochs=max_epochs, accelerator=accelerator, devices=devices, From 54f02bf8c9104157a51b2d966056dbde3ba25d5c Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 1 Mar 2024 15:32:55 -0500 Subject: [PATCH 260/307] two fixes wds loads /255**2 wrongly, leave but warn finetune never tuned batchnorm, raise error --- zoobot/pytorch/datasets/webdatamodule.py | 10 ++- zoobot/pytorch/training/finetune.py | 93 ++++++++++++++++-------- 2 files changed, 72 insertions(+), 31 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index cc5d5800..05ce84e4 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -79,7 +79,9 @@ def make_image_transform(self, mode="train"): crop_scale_bounds=self.crop_scale_bounds, crop_ratio_bounds=self.crop_ratio_bounds, resize_after_crop=self.resize_after_crop, - pytorch_greyscale=not self.color + pytorch_greyscale=not self.color, + to_float=True # wrong, webdataset rgb decoder already converts to 0-1 float + # TODO this must be changed! 
will be different for new model training runs ) # A.Compose object # logging.warning('Minimal augmentations for speed test') @@ -90,8 +92,12 @@ def make_image_transform(self, mode="train"): def do_transform(img): + # img is 0-1 np array, intended for albumentations assert img.shape[2] < 4 # 1 or 3 channels in shape[2] dim, i.e. numpy/pil HWC convention # if not, check decode mode is 'rgb' not 'torchrgb' + # TODO could likely use torch ToTensorV2 here instead of returning np float32 + # TODO or could transform in uint8 as I do with torchvision + # TODO need to generally rationalise my transform options return np.transpose(augmentation_transform(image=np.array(img))["image"], axes=[2, 0, 1]).astype(np.float32) return do_transform @@ -108,7 +114,7 @@ def make_loader(self, urls, mode="train"): if self.train_transform is None: logging.info('Using default transform') - decode_mode = 'rgb' # np.array, for albumentations + decode_mode = 'rgb' # loads 0-1 np.array, for albumentations transform_image = self.make_image_transform(mode=mode) else: logging.info('Ignoring hparams and using directly-passed transforms') diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 84aa14ba..0d3f52e0 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -144,6 +144,22 @@ def __init__( self.visualize_images = visualize_images def configure_optimizers(self): + """ + This controls which parameters get optimized + + self.head is always optimized, with no learning rate decay + when self.n_blocks == 0, only self.head is optimized (i.e. frozen* encoder) + + for self.encoder, we enumerate the blocks (groups of layers) to potentially finetune + and then pick the top self.n_blocks to finetune + + weight_decay is applied to both the head and (if relevant) the encoder + learning rate decay is applied to the encoder only: lr * (lr_decay**block_n), ignoring the head (block 0) + + What counts as a "block" is a bit fuzzy, but I generally use the self.encoder.stages from timm. I also count the stem as a block. 
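To make the docstring above concrete, here is a stripped-down sketch of the block-wise learning-rate decay it describes, using a toy three-block stand-in for an encoder. The real method enumerates the timm stem and stages as explained, so treat this only as an illustration of the param-group arithmetic, not as code from the repository.

    import torch

    encoder_blocks = [                      # stand-ins for the stem + stages of a timm encoder
        torch.nn.Conv2d(3, 16, 3),
        torch.nn.Conv2d(16, 32, 3),
        torch.nn.Conv2d(32, 64, 3),
    ]
    head = torch.nn.Linear(64, 4)

    lr, lr_decay, n_blocks = 1e-4, 0.75, 2

    param_groups = [{'params': head.parameters(), 'lr': lr}]    # head always trained, no lr decay
    blocks_highest_first = encoder_blocks[::-1]                  # block nearest the head comes first
    for i, block in enumerate(blocks_highest_first[:n_blocks]):  # finetune only the top n_blocks
        param_groups.append({'params': block.parameters(), 'lr': lr * (lr_decay ** i)})
    # deeper blocks are simply left out of the optimizer, i.e. frozen

    optimizer = torch.optim.AdamW(param_groups, weight_decay=0.05)  # weight decay shared by every group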
+ + *batch norm layers may optionally still have updated statistics using always_train_batchnorm + """ if isinstance(self.encoder, CustomMAEEncoder): logging.info('Using custom optimizer for MAE encoder') @@ -172,10 +188,10 @@ def configure_optimizers(self): # TODO for now, these count as separate layers, not ideal early_tuneable_layers = [self.encoder.conv_stem, self.encoder.bn1] encoder_blocks = list(self.encoder.blocks) - blocks_to_tune = early_tuneable_layers + encoder_blocks + tuneable_blocks = early_tuneable_layers + encoder_blocks elif isinstance(self.encoder, timm.models.ResNet): # all timm resnets seem to have this structure - blocks_to_tune = [ + tuneable_blocks = [ # similarly self.encoder.conv1, self.encoder.bn1, @@ -185,24 +201,28 @@ def configure_optimizers(self): self.encoder.layer4 ] elif isinstance(self.encoder, timm.models.MaxxVit): - blocks_to_tune = [self.encoder.stem] + [stage for stage in self.encoder.stages] + tuneable_blocks = [self.encoder.stem] + [stage for stage in self.encoder.stages] elif isinstance(self.encoder, timm.models.ConvNeXt): # stem + 4 blocks, for all sizes # https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py#L264 - blocks_to_tune = [self.encoder.stem] + [stage for stage in self.encoder.stages] + tuneable_blocks = [self.encoder.stem] + [stage for stage in self.encoder.stages] else: raise ValueError(f'Encoder architecture not automatically recognised: {type(self.encoder)}') assert self.n_blocks <= len( - blocks_to_tune - ), f"Network only has {len(blocks_to_tune)} tuneable blocks, {self.n_blocks} specified for finetuning" + tuneable_blocks + ), f"Network only has {len(tuneable_blocks)} tuneable blocks, {self.n_blocks} specified for finetuning" # take n blocks, ordered highest layer to lowest layer - blocks_to_tune.reverse() + tuneable_blocks.reverse() + logging.info('possible blocks to tune: {}'.format(len(tuneable_blocks))) # will finetune all params in first N - blocks_to_tune = blocks_to_tune[:self.n_blocks] + logging.info('blocks that will be tuned: {}'.format(self.n_blocks)) + blocks_to_tune = tuneable_blocks[:self.n_blocks] # optionally, can finetune batchnorm params in remaining layers - remaining_blocks = blocks_to_tune[self.n_blocks:] + remaining_blocks = tuneable_blocks[self.n_blocks:] + logging.info('Remaining blocks: {}'.format(len(remaining_blocks))) + assert not any([block in remaining_blocks for block in blocks_to_tune]), 'Some blocks are in both tuneable and remaining' # Append parameters of layers for finetuning along with decayed learning rate for i, block in enumerate(blocks_to_tune): # _ is the block name e.g. 
'3' @@ -214,11 +234,21 @@ def configure_optimizers(self): # optionally, for the remaining layers (not otherwise finetuned) you can choose to still FT the batchnorm layers for i, block in enumerate(remaining_blocks): if self.always_train_batchnorm: - params.append({ - "params": get_batch_norm_params_lighting(block), - "lr": lr * (self.lr_decay**i) - }) - + raise NotImplementedError + # _, block_batch_norm_params = get_batch_norm_params_lighting(block) + # params.append({ + # "params": block_batch_norm_params, + # "lr": lr * (self.lr_decay**i) + # }) + + + logging.info('param groups: {}'.format(len(params))) + for param_group_n, param_group in enumerate(params): + shapes_within_param_group = [p.shape for p in list(param_group['params'])] + logging.info('param group {}: {}'.format(param_group_n, shapes_within_param_group)) + # print('head params to optimize', [p.shape for p in params[0]['params']]) # head only + # print(list(param_group['params']) for param_group in params) + # exit() # Initialize AdamW optimizer opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict @@ -710,22 +740,27 @@ def get_trainer( return trainer # TODO check exactly which layers get FTd -def is_tuneable(block_of_layers): - if len(list(block_of_layers.parameters())) == 0: - logging.info('Skipping block with no params') - logging.info(block_of_layers) - return False - else: - # currently, allowed to include batchnorm - return True +# def is_tuneable(block_of_layers): +# if len(list(block_of_layers.parameters())) == 0: +# logging.info('Skipping block with no params') +# logging.info(block_of_layers) +# return False +# else: +# # currently, allowed to include batchnorm +# return True -def get_batch_norm_params_lighting(parent_module, current_params=[]): - for child_module in parent_module.children(): - if isinstance(child_module, torch.nn.BatchNorm2d): - current_params += child_module.parameters() - else: - current_params = get_batch_norm_params_lighting(child_module, current_params) - return current_params +# def get_batch_norm_params_lighting(parent_module, checked_params=set(), batch_norm_params=[]): + +# modules = parent_module.modules() +# for module in modules: +# if id(module) not in checked_params: +# checked_params.add(id(module)) +# if isinstance(module, torch.nn.BatchNorm2d): +# batch_norm_params += module.parameters() +# else: +# checked_params, batch_norm_params = get_batch_norm_params_lighting(module, checked_params, batch_norm_params) + +# return checked_params, batch_norm_params From 741bda348925619d156e3c544e01e13d65fce838 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 08:49:53 -0500 Subject: [PATCH 261/307] add cosine logging --- zoobot/pytorch/training/finetune.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 0d3f52e0..9f818d89 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -253,6 +253,7 @@ def configure_optimizers(self): opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict if self.cosine_schedule: + logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy return { "optimizer": opt, From 9d8b79141c4cad7a57da669454550d744e118b3f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 
09:07:34 -0500 Subject: [PATCH 262/307] simplify --- zoobot/pytorch/training/finetune.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 9f818d89..31f4ff06 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -255,20 +255,20 @@ def configure_optimizers(self): if self.cosine_schedule: logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy + # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers + # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. + lr_scheduler = CosineWarmupScheduler( + optimizer=opt, + warmup_epochs=self.warmup_epochs, + max_epochs=self.max_cosine_epochs, + start_value=self.learning_rate, + end_value=self.learning_rate * self.max_learning_rate_reduction_factor, + ) + # lr_scheduler_config default is frequency=1, interval=epoch return { "optimizer": opt, - "lr_scheduler": { - "scheduler": CosineWarmupScheduler( - optimizer=opt, - warmup_epochs=self.warmup_epochs, - max_epochs=self.max_cosine_epochs, - start_value=self.learning_rate, - end_value=self.learning_rate * self.max_learning_rate_reduction_factor, - ), - 'interval': 'epoch', - "frequency": 1 - } - } + "lr_scheduler": lr_scheduler + } else: return opt From 09c70ba6e8dbe4c59ebbfa9c937d070ecfbbeb98 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 09:26:43 -0500 Subject: [PATCH 263/307] pure pytorch? --- zoobot/pytorch/training/finetune.py | 32 ++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 31f4ff06..412752f8 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -253,21 +253,33 @@ def configure_optimizers(self): opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict if self.cosine_schedule: - logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) - from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy - # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers - # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. - lr_scheduler = CosineWarmupScheduler( + # logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) + # from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy + # # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers + # # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. 
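(Aside on the Lightning contract cited above: `configure_optimizers` can return either the bare optimizer or a dict with an `optimizer` key and an optional `lr_scheduler` config. A tiny self-contained sketch, using `StepLR` and a placeholder `Linear` module purely for illustration:)

    import torch

    model = torch.nn.Linear(10, 2)  # placeholder module
    opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=10)

    # what configure_optimizers hands back to the Trainer when a scheduler is wanted;
    # returning just `opt` on its own is also valid
    configured = {
        'optimizer': opt,
        'lr_scheduler': {'scheduler': scheduler, 'interval': 'epoch', 'frequency': 1},
    }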
+ # lr_scheduler = CosineWarmupScheduler( + # optimizer=opt, + # warmup_epochs=self.warmup_epochs, + # max_epochs=self.max_cosine_epochs, + # start_value=self.learning_rate, + # end_value=self.learning_rate * self.max_learning_rate_reduction_factor, + # ) + + logging.info('Using cosine schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) + lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer=opt, - warmup_epochs=self.warmup_epochs, - max_epochs=self.max_cosine_epochs, - start_value=self.learning_rate, - end_value=self.learning_rate * self.max_learning_rate_reduction_factor, + T_max=self.max_cosine_epochs, + eta_min=self.learning_rate * self.max_learning_rate_reduction_factor ) + # lr_scheduler_config default is frequency=1, interval=epoch return { "optimizer": opt, - "lr_scheduler": lr_scheduler + "lr_scheduler": { + 'scheduler': lr_scheduler, + 'interval': 'epoch', + 'frequency': 1 + } } else: return opt From e2c509e1e17339bf646e4b6d92ef600732df9cbe Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 09:31:50 -0500 Subject: [PATCH 264/307] cosine logging --- zoobot/pytorch/training/finetune.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 412752f8..27bc8ec7 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -245,12 +245,13 @@ def configure_optimizers(self): logging.info('param groups: {}'.format(len(params))) for param_group_n, param_group in enumerate(params): shapes_within_param_group = [p.shape for p in list(param_group['params'])] - logging.info('param group {}: {}'.format(param_group_n, shapes_within_param_group)) + logging.debug('param group {}: {}'.format(param_group_n, shapes_within_param_group)) # print('head params to optimize', [p.shape for p in params[0]['params']]) # head only # print(list(param_group['params']) for param_group in params) # exit() # Initialize AdamW optimizer opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict + logging.info('Optimizer ready, configuring scheduler') if self.cosine_schedule: # logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) @@ -282,6 +283,7 @@ def configure_optimizers(self): } } else: + logging.info('Learning rate scheduler not used') return opt From 31455423dc416bcc8b5016637c0936139da7a326 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 11:17:57 -0500 Subject: [PATCH 265/307] partially reverse two fixes changes --- zoobot/pytorch/training/finetune.py | 84 ++++++++++++++--------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 27bc8ec7..d8f7c2e1 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -220,9 +220,9 @@ def configure_optimizers(self): logging.info('blocks that will be tuned: {}'.format(self.n_blocks)) blocks_to_tune = tuneable_blocks[:self.n_blocks] # optionally, can finetune batchnorm params in remaining layers - remaining_blocks = tuneable_blocks[self.n_blocks:] - logging.info('Remaining blocks: {}'.format(len(remaining_blocks))) - assert not any([block in remaining_blocks for block in blocks_to_tune]), 'Some blocks are in both tuneable and remaining' + # remaining_blocks = tuneable_blocks[self.n_blocks:] + # logging.info('Remaining blocks: 
{}'.format(len(remaining_blocks))) + # assert not any([block in remaining_blocks for block in blocks_to_tune]), 'Some blocks are in both tuneable and remaining' # Append parameters of layers for finetuning along with decayed learning rate for i, block in enumerate(blocks_to_tune): # _ is the block name e.g. '3' @@ -232,9 +232,9 @@ def configure_optimizers(self): }) # optionally, for the remaining layers (not otherwise finetuned) you can choose to still FT the batchnorm layers - for i, block in enumerate(remaining_blocks): - if self.always_train_batchnorm: - raise NotImplementedError + # for i, block in enumerate(remaining_blocks): + # if self.always_train_batchnorm: + # raise NotImplementedError # _, block_batch_norm_params = get_batch_norm_params_lighting(block) # params.append({ # "params": block_batch_norm_params, @@ -242,10 +242,10 @@ def configure_optimizers(self): # }) - logging.info('param groups: {}'.format(len(params))) - for param_group_n, param_group in enumerate(params): - shapes_within_param_group = [p.shape for p in list(param_group['params'])] - logging.debug('param group {}: {}'.format(param_group_n, shapes_within_param_group)) + # logging.info('param groups: {}'.format(len(params))) + # for param_group_n, param_group in enumerate(params): + # shapes_within_param_group = [p.shape for p in list(param_group['params'])] + # logging.debug('param group {}: {}'.format(param_group_n, shapes_within_param_group)) # print('head params to optimize', [p.shape for p in params[0]['params']]) # head only # print(list(param_group['params']) for param_group in params) # exit() @@ -253,38 +253,38 @@ def configure_optimizers(self): opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict logging.info('Optimizer ready, configuring scheduler') - if self.cosine_schedule: - # logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) - # from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy - # # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers - # # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. 
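(The `always_train_batchnorm` branch above is stubbed out for now. If it comes back, collecting only the BatchNorm2d parameters of a block could look roughly like the sketch below; `batch_norm_parameters` is a hypothetical helper, not the repo's implementation.)

    import torch

    def batch_norm_parameters(block: torch.nn.Module):
        # yield the parameters of every BatchNorm2d submodule, however deeply nested
        for module in block.modules():
            if isinstance(module, torch.nn.BatchNorm2d):
                yield from module.parameters()

    # quick check on a toy block: BatchNorm2d(8) contributes weight + bias = 16 params
    block = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8))
    assert sum(p.numel() for p in batch_norm_parameters(block)) == 16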
- # lr_scheduler = CosineWarmupScheduler( - # optimizer=opt, - # warmup_epochs=self.warmup_epochs, - # max_epochs=self.max_cosine_epochs, - # start_value=self.learning_rate, - # end_value=self.learning_rate * self.max_learning_rate_reduction_factor, - # ) - - logging.info('Using cosine schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) - lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer=opt, - T_max=self.max_cosine_epochs, - eta_min=self.learning_rate * self.max_learning_rate_reduction_factor - ) - - # lr_scheduler_config default is frequency=1, interval=epoch - return { - "optimizer": opt, - "lr_scheduler": { - 'scheduler': lr_scheduler, - 'interval': 'epoch', - 'frequency': 1 - } - } - else: - logging.info('Learning rate scheduler not used') - return opt + # if self.cosine_schedule: + # # logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) + # # from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy + # # # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers + # # # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. + # # lr_scheduler = CosineWarmupScheduler( + # # optimizer=opt, + # # warmup_epochs=self.warmup_epochs, + # # max_epochs=self.max_cosine_epochs, + # # start_value=self.learning_rate, + # # end_value=self.learning_rate * self.max_learning_rate_reduction_factor, + # # ) + + # logging.info('Using cosine schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) + # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + # optimizer=opt, + # T_max=self.max_cosine_epochs, + # eta_min=self.learning_rate * self.max_learning_rate_reduction_factor + # ) + + # # lr_scheduler_config default is frequency=1, interval=epoch + # return { + # "optimizer": opt, + # "lr_scheduler": { + # 'scheduler': lr_scheduler, + # 'interval': 'epoch', + # 'frequency': 1 + # } + # } + # else: + # logging.info('Learning rate scheduler not used') + return opt def forward(self, x: torch.Tensor) -> torch.Tensor: From f99dfd0173f2b5a4fc288fc57a70bfd276cb3472 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 11:22:27 -0500 Subject: [PATCH 266/307] carefully start adding back --- zoobot/pytorch/training/finetune.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index d8f7c2e1..5608fa43 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -220,9 +220,9 @@ def configure_optimizers(self): logging.info('blocks that will be tuned: {}'.format(self.n_blocks)) blocks_to_tune = tuneable_blocks[:self.n_blocks] # optionally, can finetune batchnorm params in remaining layers - # remaining_blocks = tuneable_blocks[self.n_blocks:] - # logging.info('Remaining blocks: {}'.format(len(remaining_blocks))) - # assert not any([block in remaining_blocks for block in blocks_to_tune]), 'Some blocks are in both tuneable and remaining' + remaining_blocks = tuneable_blocks[self.n_blocks:] + logging.info('Remaining blocks: {}'.format(len(remaining_blocks))) + assert not any([block in remaining_blocks for block in blocks_to_tune]), 'Some blocks are in both tuneable and remaining' # Append 
parameters of layers for finetuning along with decayed learning rate for i, block in enumerate(blocks_to_tune): # _ is the block name e.g. '3' @@ -232,9 +232,9 @@ def configure_optimizers(self): }) # optionally, for the remaining layers (not otherwise finetuned) you can choose to still FT the batchnorm layers - # for i, block in enumerate(remaining_blocks): - # if self.always_train_batchnorm: - # raise NotImplementedError + for i, block in enumerate(remaining_blocks): + if self.always_train_batchnorm: + raise NotImplementedError # _, block_batch_norm_params = get_batch_norm_params_lighting(block) # params.append({ # "params": block_batch_norm_params, @@ -242,7 +242,7 @@ def configure_optimizers(self): # }) - # logging.info('param groups: {}'.format(len(params))) + logging.info('param groups: {}'.format(len(params))) # for param_group_n, param_group in enumerate(params): # shapes_within_param_group = [p.shape for p in list(param_group['params'])] # logging.debug('param group {}: {}'.format(param_group_n, shapes_within_param_group)) From ce2c80a3054fe67341a5cd89a503809a1226c550 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 11:24:42 -0500 Subject: [PATCH 267/307] try cosine uncommented but False --- zoobot/pytorch/training/finetune.py | 62 ++++++++++++++--------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 5608fa43..7434ddaa 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -253,37 +253,37 @@ def configure_optimizers(self): opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict logging.info('Optimizer ready, configuring scheduler') - # if self.cosine_schedule: - # # logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) - # # from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy - # # # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers - # # # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. 
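(lightly is a new optional dependency here, hence the lazy import inside `configure_optimizers`. A generic guard for that pattern might look like this; it is a sketch only, not what the repo currently does:)

    def get_cosine_warmup_scheduler_cls():
        # import lazily so users without lightly installed can still finetune without a cosine schedule
        try:
            from lightly.utils.scheduler import CosineWarmupScheduler
        except ImportError as err:
            raise ImportError('cosine_schedule=True needs the optional lightly package: pip install lightly') from err
        return CosineWarmupScheduler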
- # # lr_scheduler = CosineWarmupScheduler( - # # optimizer=opt, - # # warmup_epochs=self.warmup_epochs, - # # max_epochs=self.max_cosine_epochs, - # # start_value=self.learning_rate, - # # end_value=self.learning_rate * self.max_learning_rate_reduction_factor, - # # ) - - # logging.info('Using cosine schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) - # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - # optimizer=opt, - # T_max=self.max_cosine_epochs, - # eta_min=self.learning_rate * self.max_learning_rate_reduction_factor - # ) - - # # lr_scheduler_config default is frequency=1, interval=epoch - # return { - # "optimizer": opt, - # "lr_scheduler": { - # 'scheduler': lr_scheduler, - # 'interval': 'epoch', - # 'frequency': 1 - # } - # } - # else: - # logging.info('Learning rate scheduler not used') + if self.cosine_schedule: + logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) + from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy + # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers + # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. + lr_scheduler = CosineWarmupScheduler( + optimizer=opt, + warmup_epochs=self.warmup_epochs, + max_epochs=self.max_cosine_epochs, + start_value=self.learning_rate, + end_value=self.learning_rate * self.max_learning_rate_reduction_factor, + ) + + # logging.info('Using cosine schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) + # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + # optimizer=opt, + # T_max=self.max_cosine_epochs, + # eta_min=self.learning_rate * self.max_learning_rate_reduction_factor + # ) + + # lr_scheduler_config default is frequency=1, interval=epoch + return { + "optimizer": opt, + "lr_scheduler": { + 'scheduler': lr_scheduler, + 'interval': 'epoch', + 'frequency': 1 + } + } + else: + logging.info('Learning rate scheduler not used') return opt From eaa98cea385836e4470346b5f2500f2ec825ff9d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 11:31:36 -0500 Subject: [PATCH 268/307] add a warning --- zoobot/pytorch/training/finetune.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 7434ddaa..0d277a75 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -112,14 +112,13 @@ def __init__( # self.encoder_dim = 9216 else: self.encoder_dim = define_model.get_encoder_dim(self.encoder) - self.n_blocks = n_blocks - logging.info('Blocks to finetune: {}'.format(n_layers)) # for backwards compat. if n_layers: logging.warning('FinetuneableZoobot(n_layers) is now renamed to n_blocks, please update to pass n_blocks instead! 
For now, setting n_blocks=n_layers') self.n_blocks = n_layers - logging.info('Layers to finetune: {}'.format(n_layers)) + else: + self.n_blocks = n_blocks self.learning_rate = learning_rate self.lr_decay = lr_decay @@ -243,6 +242,8 @@ def configure_optimizers(self): logging.info('param groups: {}'.format(len(params))) + + # because it iterates through the generators, THIS BREAKS TRAINING so only uncomment to debug params # for param_group_n, param_group in enumerate(params): # shapes_within_param_group = [p.shape for p in list(param_group['params'])] # logging.debug('param group {}: {}'.format(param_group_n, shapes_within_param_group)) @@ -250,6 +251,7 @@ def configure_optimizers(self): # print(list(param_group['params']) for param_group in params) # exit() # Initialize AdamW optimizer + opt = torch.optim.AdamW(params, weight_decay=self.weight_decay) # lr included in params dict logging.info('Optimizer ready, configuring scheduler') From 6b754b93fdfdfdca7e3293c3fca15f2aedeeb6f6 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 2 Mar 2024 12:05:26 -0500 Subject: [PATCH 269/307] torch cosine --- zoobot/pytorch/training/finetune.py | 33 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 0d277a75..744e4b43 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -256,26 +256,25 @@ def configure_optimizers(self): logging.info('Optimizer ready, configuring scheduler') if self.cosine_schedule: - logging.info('Using cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) - from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy - # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers - # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. - lr_scheduler = CosineWarmupScheduler( - optimizer=opt, - warmup_epochs=self.warmup_epochs, - max_epochs=self.max_cosine_epochs, - start_value=self.learning_rate, - end_value=self.learning_rate * self.max_learning_rate_reduction_factor, - ) - - # logging.info('Using cosine schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) - # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + # logging.info('Using lightly cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) + # from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy + # # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers + # # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. 
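(The torch-only scheduler below drops warmup. If warmup is still wanted without the lightly dependency, PyTorch 1.10+ can chain a linear warmup into the cosine decay; a sketch with illustrative values and a placeholder module:)

    import torch

    model = torch.nn.Linear(10, 2)  # placeholder module
    lr, reduction_factor, warmup_epochs, max_epochs = 1e-4, 0.01, 5, 100

    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    warmup = torch.optim.lr_scheduler.LinearLR(opt, start_factor=0.1, total_iters=warmup_epochs)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        opt, T_max=max_epochs - warmup_epochs, eta_min=lr * reduction_factor)
    scheduler = torch.optim.lr_scheduler.SequentialLR(
        opt, schedulers=[warmup, cosine], milestones=[warmup_epochs])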
+ # lr_scheduler = CosineWarmupScheduler( # optimizer=opt, - # T_max=self.max_cosine_epochs, - # eta_min=self.learning_rate * self.max_learning_rate_reduction_factor + # warmup_epochs=self.warmup_epochs, + # max_epochs=self.max_cosine_epochs, + # start_value=self.learning_rate, + # end_value=self.learning_rate * self.max_learning_rate_reduction_factor, # ) - # lr_scheduler_config default is frequency=1, interval=epoch + logging.info('Using CosineAnnealingLR schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) + lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer=opt, + T_max=self.max_cosine_epochs, + eta_min=self.learning_rate * self.max_learning_rate_reduction_factor + ) + return { "optimizer": opt, "lr_scheduler": { From c14637ee999ae3daf180c13b873c966e95e6d956 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 5 Mar 2024 15:26:40 -0500 Subject: [PATCH 270/307] remove mae --- zoobot/pytorch/training/finetune.py | 39 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 744e4b43..098cd3ee 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -16,9 +16,6 @@ import torchmetrics as tm import timm - -from foundation.models.mae_lightly import CustomMAEEncoder - from zoobot.pytorch.training import losses from zoobot.pytorch.estimators import define_model from zoobot.shared import schemas @@ -159,24 +156,26 @@ def configure_optimizers(self): *batch norm layers may optionally still have updated statistics using always_train_batchnorm """ + - if isinstance(self.encoder, CustomMAEEncoder): - logging.info('Using custom optimizer for MAE encoder') - # equivalent to usual, but in param_group format - head_param_groups = [ - {'params': self.head.parameters(), - 'weight_decay': self.weight_decay, - 'lr_scale': 1. # no lr decay on head - } - ] - # now custom bit for the encoder - encoder_param_groups = self.encoder.get_param_groups(self.weight_decay, self.lr_decay) - n_param_groups_to_tune = self.n_blocks * 2 # finetune top N. First layer is pos embedding, then pairs of decay/no decay, 18 pairs by default - if n_param_groups_to_tune > len(encoder_param_groups): - logging.warning('more param groups (blocks*2) specified to finetune than available') - encoder_param_groups_to_tune = encoder_param_groups[-n_param_groups_to_tune:] - param_groups = encoder_param_groups_to_tune + head_param_groups - return torch.optim.AdamW(param_groups, lr=self.learning_rate) + # from foundation.models.mae_lightly import CustomMAEEncoder + # if isinstance(self.encoder, CustomMAEEncoder): + # logging.info('Using custom optimizer for MAE encoder') + # # equivalent to usual, but in param_group format + # head_param_groups = [ + # {'params': self.head.parameters(), + # 'weight_decay': self.weight_decay, + # 'lr_scale': 1. # no lr decay on head + # } + # ] + # # now custom bit for the encoder + # encoder_param_groups = self.encoder.get_param_groups(self.weight_decay, self.lr_decay) + # n_param_groups_to_tune = self.n_blocks * 2 # finetune top N. 
First layer is pos embedding, then pairs of decay/no decay, 18 pairs by default + # if n_param_groups_to_tune > len(encoder_param_groups): + # logging.warning('more param groups (blocks*2) specified to finetune than available') + # encoder_param_groups_to_tune = encoder_param_groups[-n_param_groups_to_tune:] + # param_groups = encoder_param_groups_to_tune + head_param_groups + # return torch.optim.AdamW(param_groups, lr=self.learning_rate) lr = self.learning_rate params = [{"params": self.head.parameters(), "lr": lr}] From 477c0757578788d69749ee4101b031b263383640 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 5 Mar 2024 15:41:10 -0500 Subject: [PATCH 271/307] remove a little more --- zoobot/pytorch/training/finetune.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 098cd3ee..13b0e223 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -104,11 +104,11 @@ def __init__( self.encoder = encoder # TODO read as encoder property - if isinstance(self.encoder, CustomMAEEncoder): - self.encoder_dim = 256 # vit hidden dim, assuming average pool over seq dim - # self.encoder_dim = 9216 - else: - self.encoder_dim = define_model.get_encoder_dim(self.encoder) + # if isinstance(self.encoder, CustomMAEEncoder): + # self.encoder_dim = 256 # vit hidden dim, assuming average pool over seq dim + # # self.encoder_dim = 9216 + # else: + self.encoder_dim = define_model.get_encoder_dim(self.encoder) # for backwards compat. if n_layers: From d1855051d4d7b160928b2836a1c4d17e46c80b12 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 8 Mar 2024 17:31:16 -0500 Subject: [PATCH 272/307] minor cleanup --- README.md | 7 +- zoobot/pytorch/training/debug_split.ipynb | 184 ------- zoobot/pytorch/training/finetune.py | 83 +--- zoobot/pytorch/training/temp.ipynb | 569 ---------------------- 4 files changed, 8 insertions(+), 835 deletions(-) delete mode 100644 zoobot/pytorch/training/debug_split.ipynb delete mode 100644 zoobot/pytorch/training/temp.ipynb diff --git a/README.md b/README.md index b52d708a..f74e7e1b 100755 --- a/README.md +++ b/README.md @@ -133,15 +133,18 @@ CUDA 12.1 for PyTorch 2.1.0: conda activate zoobot39_torch conda install -c conda-forge cudatoolkit=12.1 - ### Recent release features (v2.0.0) -- New pretrained architectures: EfficientnetV2 S/M/L, MaxViT tiny/small/base, ViT tiny/small, and more. +- New pretrained architectures: ConvNeXT, EfficientNetV2, MaxViT, and more. Each in several sizes. - Reworked finetuning procedure. All these architectures are finetuneable through a common method. +- Reworked finetuning options. Batch norm finetuning removed. Cosine schedule option added. +- Reworked finetuning saving/loading. Auto-downloads encoder from HuggingFace. - Now supports regression finetuning (as well as multi-class and binary). See `pytorch/examples/finetuning` - Updated `timm` to 0.9.10, allowing latest model architectures. Previously downloaded checkpoints may not load correctly! - (internal until published) GZ Evo v2 now includes Cosmic Dawn (HSC H2O). Significant performance improvement on HSC finetuning. Also now includes GZ UKIDSS (dragged from our archives). 
- Updated `pytorch` to `2.1.0` +- Added support for webdatasets (only recommended for large-scale distributed training) +- Improved per-question logging when training from scratch - Added option to compile encoder for max speed (not recommended for finetuning, only for pretraining). - Deprecates TensorFlow. The CS research community focuses on PyTorch and new frameworks like JAX. diff --git a/zoobot/pytorch/training/debug_split.ipynb b/zoobot/pytorch/training/debug_split.ipynb deleted file mode 100644 index 954cb61a..00000000 --- a/zoobot/pytorch/training/debug_split.ipynb +++ /dev/null @@ -1,184 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import webdataset as wds\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "world_size: processes participating in job (e.g. 4)\n", - "\n", - "rank: index of current process (e.g. 2)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['WORLD_SIZE'] = '4'" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['5', '9', '3']\n", - "['6', '2', '7']\n", - "['1', '4']\n", - "['8', '0']\n" - ] - } - ], - "source": [ - "for rank in range(4):\n", - " os.environ['RANK'] = str(rank)\n", - " print(list(wds.split_by_node({str(x) for x in range(10)})))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def nodesplitter_func(urls, node_id, node_count): # SimpleShardList\n", - " # print(urls)\n", - " # try:\n", - " # node_id, node_count = torch.distributed.get_rank(), torch.distributed.get_world_size()\n", - " urls_to_use = list(urls)[node_id::node_count]\n", - " print(f'id: {node_id}, of count {node_count}. \\nURLS: {len(urls_to_use)} of {len(urls)} ({urls_to_use})\\n\\n')\n", - " # except RuntimeError:\n", - " # # print('Distributed not initialised. Hopefully single node.')\n", - " return urls_to_use\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id: 0, of count 4. \n", - "URLS: 3 of 10 (['5', '9', '3'])\n", - "\n", - "\n", - "['5', '9', '3']\n", - "id: 1, of count 4. \n", - "URLS: 3 of 10 (['6', '2', '7'])\n", - "\n", - "\n", - "['6', '2', '7']\n", - "id: 2, of count 4. \n", - "URLS: 2 of 10 (['1', '4'])\n", - "\n", - "\n", - "['1', '4']\n", - "id: 3, of count 4. 
\n", - "URLS: 2 of 10 (['8', '0'])\n", - "\n", - "\n", - "['8', '0']\n" - ] - } - ], - "source": [ - "for rank in range(4):\n", - " os.environ['RANK'] = str(rank)\n", - " print(nodesplitter_func({str(x) for x in range(10)}, rank, 4))" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3, 4, 0, 1)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wds.utils.pytorch_worker_info()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['5', '2']" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from itertools import islice\n", - "\n", - "def get_per_worker(urls={str(x) for x in range(10)}, worker_n=1, num_workers=5):\n", - " for s in islice(urls, worker_n, None, num_workers):\n", - " yield s\n", - "\n", - "list(get_per_worker(worker_n=0))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 13b0e223..37d79c8d 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -70,7 +70,7 @@ def __init__( weight_decay=0.05, learning_rate=1e-4, # 10x lower than typical, you may like to experiment dropout_prob=0.5, - always_train_batchnorm=True, + always_train_batchnorm=False, # temporarily deprecated prog_bar=True, visualize_images=False, # upload examples to wandb, good for debugging seed=42, @@ -104,10 +104,6 @@ def __init__( self.encoder = encoder # TODO read as encoder property - # if isinstance(self.encoder, CustomMAEEncoder): - # self.encoder_dim = 256 # vit hidden dim, assuming average pool over seq dim - # # self.encoder_dim = 9216 - # else: self.encoder_dim = define_model.get_encoder_dim(self.encoder) # for backwards compat. @@ -129,7 +125,8 @@ def __init__( self.always_train_batchnorm = always_train_batchnorm if self.always_train_batchnorm: - logging.info('always_train_batchnorm=True, so all batch norm layers will be finetuned') + raise NotImplementedError('Temporarily deprecated, always_train_batchnorm=True not supported') + # logging.info('always_train_batchnorm=True, so all batch norm layers will be finetuned') self.train_loss_metric = tm.MeanMetric() self.val_loss_metric = tm.MeanMetric() @@ -156,26 +153,6 @@ def configure_optimizers(self): *batch norm layers may optionally still have updated statistics using always_train_batchnorm """ - - - # from foundation.models.mae_lightly import CustomMAEEncoder - # if isinstance(self.encoder, CustomMAEEncoder): - # logging.info('Using custom optimizer for MAE encoder') - # # equivalent to usual, but in param_group format - # head_param_groups = [ - # {'params': self.head.parameters(), - # 'weight_decay': self.weight_decay, - # 'lr_scale': 1. 
# no lr decay on head - # } - # ] - # # now custom bit for the encoder - # encoder_param_groups = self.encoder.get_param_groups(self.weight_decay, self.lr_decay) - # n_param_groups_to_tune = self.n_blocks * 2 # finetune top N. First layer is pos embedding, then pairs of decay/no decay, 18 pairs by default - # if n_param_groups_to_tune > len(encoder_param_groups): - # logging.warning('more param groups (blocks*2) specified to finetune than available') - # encoder_param_groups_to_tune = encoder_param_groups[-n_param_groups_to_tune:] - # param_groups = encoder_param_groups_to_tune + head_param_groups - # return torch.optim.AdamW(param_groups, lr=self.learning_rate) lr = self.learning_rate params = [{"params": self.head.parameters(), "lr": lr}] @@ -753,57 +730,3 @@ def get_trainer( ) return trainer - -# TODO check exactly which layers get FTd -# def is_tuneable(block_of_layers): -# if len(list(block_of_layers.parameters())) == 0: -# logging.info('Skipping block with no params') -# logging.info(block_of_layers) -# return False -# else: -# # currently, allowed to include batchnorm -# return True - -# def get_batch_norm_params_lighting(parent_module, checked_params=set(), batch_norm_params=[]): - -# modules = parent_module.modules() -# for module in modules: -# if id(module) not in checked_params: -# checked_params.add(id(module)) -# if isinstance(module, torch.nn.BatchNorm2d): -# batch_norm_params += module.parameters() -# else: -# checked_params, batch_norm_params = get_batch_norm_params_lighting(module, checked_params, batch_norm_params) - -# return checked_params, batch_norm_params - - - - # when ready (don't peek often, you'll overfit) - # trainer.test(model, dataloaders=datamodule) - - # return model, checkpoint_callback.best_model_path - # trainer.callbacks[checkpoint_callback].best_model_path? - -# def investigate_structure(): - -# from zoobot.pytorch.estimators import define_model - - -# model = define_model.get_plain_pytorch_zoobot_model(output_dim=1280, include_top=False) - -# # print(model) -# # with include_top=False, first and only child is EffNet -# effnet_with_pool = list(model.children())[0] - -# # 0th is actually EffNet, 1st and 2nd are AvgPool and Identity -# effnet = list(effnet_with_pool.children())[0] - -# for layer_n, layer in enumerate(effnet.children()): -# # first bunch are Sequential module wrapping e.g. 3 MBConv blocks -# print('\n', layer_n) -# if isinstance(layer, torch.nn.Sequential): -# print(layer) -# # so the blocks to finetune are each Sequential (repeated MBConv) block -# # and other blocks can be left alone -# # (also be careful to leave batch-norm alone) diff --git a/zoobot/pytorch/training/temp.ipynb b/zoobot/pytorch/training/temp.ipynb deleted file mode 100644 index e126a866..00000000 --- a/zoobot/pytorch/training/temp.ipynb +++ /dev/null @@ -1,569 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Relative Time (Wall)scratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestampscratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp__MINscratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp__MAXscratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtimescratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime__MINscratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime__MAXscratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpuscratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu__MINscratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu__MAXscratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpuscratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu__MINscratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu__MAX
0NaN1.699053e+091.699053e+091.699054e+09630.15553260.1043591200.18725561.96605310.5390.93000
\n", - "
" - ], - "text/plain": [ - " Relative Time (Wall) \\\n", - "0 NaN \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp \\\n", - "0 1.699053e+09 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp__MIN \\\n", - "0 1.699053e+09 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _timestamp__MAX \\\n", - "0 1.699054e+09 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime \\\n", - "0 630.155532 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime__MIN \\\n", - "0 60.104359 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - _runtime__MAX \\\n", - "0 1200.187255 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu \\\n", - "0 61.966053 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu__MIN \\\n", - "0 10.53 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.0.gpu__MAX \\\n", - "0 90.93 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu \\\n", - "0 0 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu__MIN \\\n", - "0 0 \n", - "\n", - " scratch, batch 256 11 workers - 1 gpu, 2 reserved - system/gpu.1.gpu__MAX \n", - "0 0 " - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv('/home/walml/Downloads/wandb_export_2023-11-07T13 52 16.196-05 00.csv')\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# how annoying, it doesn't export the values themselves\n", - "\n", - "# transcribe manually\n", - "# https://wandb.ai/jbca-ice/narval?workspace=user-walmsley" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "data = [\n", - " (0., 0.), # by definition\n", - " (1.002, 32.4),\n", - " (1.502, 65.93),\n", - " (2.002, 75.53),\n", - " (2.502, 69.6),\n", - " (3.002, 82.67),\n", - " (3.502, 90.93),\n", - " (4.002, 82.93),\n", - " (4.502, 90.97),\n", - " (5.002, 70.),\n", - " (5.502, 41.53),\n", - " (6.002, 16.93),\n", - " (6.502, 34.33),\n", - " (7.002, 55.13),\n", - " (7.502, 58.16),\n", - " (8.002, 87.53),\n", - " (8.502, 81.13),\n", - " (9.002, 80.73),\n", - " (9.503, 84.67),\n", - " (10.03, 87.27),\n", - " (11.003, 83.4),\n", - " (11.503, 35.6),\n", - " (12.003, 11.93),\n", - " (12.503, 20.67),\n", - " (13.003, 41.2),\n", - " (13.503, 53.2),\n", - " (14.003, 69.6),\n", - " (14.503, 88.67),\n", - " (15.003, 75.73),\n", - " (15.503, 83),\n", - " (16.003, 80.33),\n", - " (16.503, 89.2),\n", - " (17.003, 65.67),\n", - " (17.503, 11.4),\n", - " (18.003, 10.53),\n", - " (18.503, 40.87),\n", - " (19.003, 52.67),\n", - " (19.503, 75.8),\n", - " (20.003, 84.13)\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(data=data, columns=['time', 'utilisation'])" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timeutilisation
00.0000.00
11.00232.40
21.50265.93
32.00275.53
42.50269.60
53.00282.67
63.50290.93
74.00282.93
84.50290.97
95.00270.00
105.50241.53
116.00216.93
126.50234.33
137.00255.13
147.50258.16
158.00287.53
168.50281.13
179.00280.73
189.50384.67
1910.03087.27
2011.00383.40
2111.50335.60
2212.00311.93
2312.50320.67
2413.00341.20
2513.50353.20
2614.00369.60
2714.50388.67
2815.00375.73
2915.50383.00
3016.00380.33
3116.50389.20
3217.00365.67
3317.50311.40
3418.00310.53
3518.50340.87
3619.00352.67
3719.50375.80
3820.00384.13
\n", - "
" - ], - "text/plain": [ - " time utilisation\n", - "0 0.000 0.00\n", - "1 1.002 32.40\n", - "2 1.502 65.93\n", - "3 2.002 75.53\n", - "4 2.502 69.60\n", - "5 3.002 82.67\n", - "6 3.502 90.93\n", - "7 4.002 82.93\n", - "8 4.502 90.97\n", - "9 5.002 70.00\n", - "10 5.502 41.53\n", - "11 6.002 16.93\n", - "12 6.502 34.33\n", - "13 7.002 55.13\n", - "14 7.502 58.16\n", - "15 8.002 87.53\n", - "16 8.502 81.13\n", - "17 9.002 80.73\n", - "18 9.503 84.67\n", - "19 10.030 87.27\n", - "20 11.003 83.40\n", - "21 11.503 35.60\n", - "22 12.003 11.93\n", - "23 12.503 20.67\n", - "24 13.003 41.20\n", - "25 13.503 53.20\n", - "26 14.003 69.60\n", - "27 14.503 88.67\n", - "28 15.003 75.73\n", - "29 15.503 83.00\n", - "30 16.003 80.33\n", - "31 16.503 89.20\n", - "32 17.003 65.67\n", - "33 17.503 11.40\n", - "34 18.003 10.53\n", - "35 18.503 40.87\n", - "36 19.003 52.67\n", - "37 19.503 75.80\n", - "38 20.003 84.13" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib.ticker import PercentFormatter" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYEAAAC5CAYAAADOB4NQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAABLiklEQVR4nO2dd1hTZ/vHvwkQIGHvGZYgshRlKlLFVUer4nrdWFe1Vm193VscrX21fdXW1trWgf21tsXVqnW+daIiQ0EQZe+9IYQk5/cHJZISIIGQAc/nurguPfM+T84593nuSaMoigKBQCAQeiV0RQtAIBAIBMVBlACBQCD0YogSIBAIhF4MUQIEAoHQiyFKgEAgEHoxRAkQCARCL4YoAQKBQOjFECVAIBAIvRiiBAgEAqEXQ5RAL+Xw4cOYOXOmosXodZBx7z7I2HYOdUUL0FuZO3cuHj9+3Gr5wYMHMX78eAVI1DHXrl3DmTNnkJCQgJqaGiQmJkJdXbVuIVUc96+//hp//vknMjIywGKxMHToUKxduxZGRkaKFk0EVRzbY8eOITIyEvn5+dDS0oK3tzfWr18PBwcHRYsmN1TrCe5hzJ8/H4sXLxZZpqenpyBpOqa+vh4BAQEYPHgwDh48qGhxOo2qjfvTp08RFhYGDw8P1NbWIjw8HKtXr8apU6cULVorVG1sbW1tsW3bNtja2qK2thaHDx/G0qVLce3aNUWLJjeIOUiBaGtrw9TUVORPU1MTwJup7fHjxxEYGAhfX18cPHgQLev9ZWRk4L333oOXlxcCAwPx6aefgsfjCdfX1dVh165dGDJkCLy8vDB58mTEx8eLyBAREYGgoCD4+/tj//79aK+e4MSJE7Fs2TIMGDBAtgMhZ1Rt3L/99ltMnDgRTk5O8PLywqZNm/Do0SNUV1fLeGS6jqqN7dixYzF48GDY2trC1dUVK1euRGZmJkpKSmQ8MsoLmQkoMcnJyTA2NsapU6eQmpqKTZs2wd7eHqGhoeDz+Vi+fDnYbDZ++eUXFBQUYOPGjdDX18f7778PANi6dSsSExPx6aefgs1mIykpCQKBQHj8ly9fgs1m4+TJk0hPT8fq1avh4+ODkJAQRV2yUqDs415eXg5NTU1oa2t3y/V3J8o8thwOB+fOnYODg4PSmdq6FYqgEObMmUO5u7tTAwYMEPnLysqiKIqiDh06RHl5eVEVFRXCfQ4ePEhNnjyZoiiK+uuvvyhPT0+qvLxcuP7HH3+k/P39KYqiqKysLMrFxYV69uyZ2PMfOnSI8vX1pTgcjnDZe++9R33yyScdyh4VFUW5uLhQjY2NUl+3olHlcacoimpoaKBCQ0OprVu3SnXd8kBVx/bWrVvUgAEDqL59+1JjxowRyttbIDMBBTJt2jSEhYWJLLOwsBD+m81mQ19fX/h/Ly8voR04PT0ddnZ2MDAwEK739vZGeXk5Kioq8OrVKzCZTHh6erZ5fjs7O+FUHQBMTExQWlraxatSflR13Pl8Pv79738DANavX9/h9opAFcfW398f58+fR3FxMX744QesWbMGZ86cgYaGhiSXrPJIrQTi4uIQExODvLw8cDgcGBoawtXVFf7+/jAxMekOGXssenp6sLOza3M9jUZrcx3VQS+gjtYDaBXZQ6PRwOfzO9xP1VHFcRcIBNiwYQPS09Nx+vRpsFisDs+jCFRxbJlMJuzs7GBnZwcvLy/4+fnhzp07GDFiRIfn6wlI5Biur6/H0aNHMXz4cISFheHatWsoLCwEh8NBcnIyDhw4gGHDhuH9999HbGxsd8vca8jMzERVVZXw/8+fPxeGrjk6OiIzMxMVFRXC9bGxsTAyMoKBgQFcXFxQV1eH58+fy1tslUfZxp2iKGzevBnx8fH4/vvvRb6UVQ1lG1txUBSlcqHPXUGiKx0zZgwGDx6M3bt3IyAgAGpqaq22KSwsxJUrV7Bx40bMmzcPs2bNkrmwPY36+noUFxeLLNPR0RE6/Oh0OrZs2YKVK1ciLS0Np0+fxqZNmwAAQUFBsLGxwYYNG/Dxxx8jPz8fhw8fxvz58wE0hb5NmDABa9euxdatW8Fms/Hy5UuYmJh0OrqnoqIC+fn5yMrKAtDk5FNTUwObzVbaL1NxqNq4b9++Hbdv38axY8cAQCi7kZGR2GdRkaja2H722WcYOX
IkzMzMUFpaimPHjsHQ0BADBw7s/CCoGBIpgZ9++glWVlbtbmNubo6wsDDMnz8fRUVFMhGup3Py5EmcPHlSZNm6deuwcOFCAICrqys8PDwwe/Zs8Pl8zJw5E6GhoQCaHqavvvoKO3fuxNSpU8FisTBp0iQsWrRIeKzw8HB8+umn+Pjjj8HhcODk5IQdO3Z0Wt5bt25h48aNwv9PmTIFAHDq1Cn4+/t3+rjyRtXG/eeffwbQZG9vyc2bN2FjY9Pp43YHqja2+fn5WLVqFcrKymBkZAQfHx+cOHECurq6nT6mqkGjJDG0EeTO4cOH8eDBA/zf//2fokXpVZBx7z7I2ConnTZ8VVVV4dChQ4iJiYFAIED//v3x4YcfEucwgUAgqBCdzhhev3499PX1sX//fuzbtw98Ph+rV6+WoWgEAoFA6G4kNgft2rULH3/8MXR0dAA0OYsvXLgALS0tAE2Zem0VkCIQCASCciKxOcjW1hahoaFYvnw5Jk2ahKlTp2L69OkIDg4Gj8fDtWvX8K9//as7ZSUQCASCjJHKMVxYWIhPPvkEhYWF2LZtGwQCAaKjo4U+AW9v7+6UlUAgEAgyRiqfgLm5OT7//HOsWLECa9aswblz5xAaGoqwsLBOKYCIiAiEhobCw8MDH330kci6lJQUTJ8+Hf3798eECRMQHR0tsv7q1asYMWIEBgwYgPfeew+FhYXCdZcuXcKQIUMQEhKCR48eCZdXVlYiNDQUtbW1UstKIBAIPRGplEB5eTmeP38ODw8PnD9/HoaGhpg0aRIuXbrUqZObmZlh+fLlmD59usjyxsZGLFu2DCNHjsSTJ0+wePFiLF++HJWVlQCA1NRUbNy4EeHh4YiKioKdnR3WrFkDAODxeAgPD8epU6ewfft2hIeHC4/7n//8B8uWLVOpxCYCgUDoTiT2Cfz222/Yt28fHB0dkZeXh82bN2P58uV45513sGfPHpw9exbbt29Hnz59JD756NGjAQBJSUkoLy8XLn/8+DE4HA4WLVoEOp2OiRMn4uTJk7h27RqmTZuGixcvIjg4GIMHDwYArFq1CkOGDEFWVhaYTCYYDAacnJxga2uL7OxsAE3p5yUlJRg1alSHchUVFbXKegSawmJTU1Ph5uYmUqSKQCAQ5E1DQwNycnIQFBTUpdLXEiuBzz//HF9++SX8/f2Rm5uLJUuWYOzYsbC1tcXXX3+NGzdu4IMPPsCff/7ZaWGaefXqFVxcXECnv5mouLq64tWrVwCaTEVeXl7CdQYGBrC0tERKSoqwbnhKSgry8/PRp08f8Hg8fPrppzhw4IBE5//5559x5MiRLl8HgUAgdDefffYZ3n333U7vL1WyWPNLWU1NTaSRAwCMHDkSQUFBnRakJbW1ta3StvX09ISdlOrq6sSur62tBZ1Ox2effYYtW7aAwWBg9+7dOHnyJEaOHInq6mosXLgQXC4XH374Ifz8/MSef8aMGWKbUCQlJWHz5s347LPP4OTkJJNrJRAIhM6QmpqKtWvXdrl0iMRKYPXq1Vi+fDmcnJyQk5ODDRs2tNqmOWegq7BYLNTU1Igsq66uFtrymUxmu+sDAwMRGBgIAMjLy8P169cRERGBf/3rX9i4cSPMzc0xZ84c3L59W2xpWzMzM5iZmbUpn5OTE9zd3bt0jQQCgSALumqallgJTJ06FSEhIcjJyQGbze7WcrbOzs44fvw4BAKBcPaRlJSEmTNnAgBcXFyQnJws3L6yshL5+flwcXFpdaw9e/Zg3bp1UFdXR0pKCjw9PcFgMMDj8VBWVgZjY+Nuuw4CgUBQdqSKDjIyMoKXl5fMFACPx0NDQwN4PB4EAgEaGhrQ2NgIPz8/MBgMfP/99+Byubh06RJycnKETt13330Xd+7cwcOHD8HhcHDo0CEMGDAAbDZb5Pg3b96EsbGxsCysjY0NoqKi8Pr1a3C5XJWuy04gEAgyQZIelAsWLKCioqI63K6srIz68ssvqdOnT0vU2/LQoUOUi4uLyN/69espiqKo5ORkaurUqZSnpyc1btw46vHjxyL7Xr58mQoJCaG8vLyoBQsWUAUFBSLra2trqdDQUJF+pg8ePKCGDx9ODRkyhPr9998lkrElCQkJlIuLC5WQkCD1vgQCgSBLZPU+kihj+MqVK/jvf/+LhoYGDB06FG5ubjA1NQWDwUB1dTXS0tIQExOD2NhYvPvuu1ixYgVMTU3locPkSmJiIkJDQxEZGanSPgFOIx9pxTWIziyHr70RHExY0NKQrDlJV/YlEAiyQ1bvI4l8AmPHjsXYsWNx//59/Pnnnzhz5gzy8vKEJhVXV1cEBwfjs88+IzZ2JYfTyMfmc8/xW0yucNmUQdbYM8mzw5d5V/YlEAjKiVQhokOGDMGQIUO6SxaCHEgvqRV5iQPAb09zsSjIEf0s9bptXwJB0ZBZrHh6TzdlGSIQCMAX8BUthtTQQMOTjDKx66IzytDXXAcUxFsHu7KvMkPDmxBhecmviHMqE418tHoZd/e7uJGPNmexqqoH/pmr1VmIEugEUblRyNXO7XhDJcNCxwIDbNli1w1g6+FZ0TMU1BSIXW/OsoCHdef2VUb0NPXgaTYQ6SV1iM2sxEA7fdibMPG8KAZVDVU95pzKBJ1GR6BNMHZcSBZ9GQ+0xo6JrniYcwcCSjYvtpZY6FhAjc8WO4sNG8wGXy1Lpe7dZrJzs2VyHKIEOoEuQxdGWp2v1aEoMkvqEchmYspA61YPob0xC8kljWKvi6IofHOrHDvfcRW7r6MJCy+Kxe+rjNBAg5upF7ZdSGp1LbsmDsKL4mcy/0JXxDmVDSNtI2SW1Ld+GcfkYkGQHRwNHFFWL3622RWMtY1xO7FS7LrYrEoMdzMGl8eV+Xm7mwpGhUyOQ5RAJ6CDDjW6as0hK+r42HOxEha6T3B6kR/eC7LD08wKDLA1gLmeNlb+9BTzhmpAT7v1dZ19VIOrz+ugrRGH8InemD+4aV9vtgGczVjIqkoVqfOk7BhqGSK9tE7sy+i9IDsYM41RzilvY+/2oYEGAy0DaKprooHXgApOBShQTecsEX/OBV08p6qgraGNBy8rxK6LyazACHdtqDXI/rni8rnwtDEUu26QnQEa+eUq9zw332eyQHWeXEKn4TQKsPdiOYqq+BjiCqSWJ0Jbqxwj3BnQYZbju4dPcDu5HHsvlqOhsemFZaFjAUMtQ9x8UY+fH9XA0VQdU/zUkFKaCKZ2GWoauPgtJhPJJc9R11in6EuUCk11TcRkVIhd9zSzAprqnUvDZ2ow0dfYA3X1hriZwEUdxxB9jd2RXaKOuEwKT9o455P0CjxJE+BmYh1Kq8X7mmigifwuLf0KqkIDrwED7Q3ErhtkZ4AGXkOHx+jMOPzfozxYGTTNgFvSPAOu4FRIIr7S0HyfNTToy+R4nZoJpKSk4MmTJygrK2vlnFi1apVMBCN0jeYvBQ01TfzyuAw5ZQJM9WNhpDsTFCiRr84xnkyUVPORkq8GF2MP5FfW415mBQawDTHfzwqJWTGYF6wObQYdFChUNlQgqbAKl+Pr4GVnDEczD
[... base64-encoded PNG payload omitted (notebook output image: A100 utilisation over time, annotated by epoch) ...]", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sns.set_context('paper')\n", - "sns.set_style('ticks')\n", - "fig, ax = plt.subplots(figsize=(4, 2))\n", - "sns.lineplot(df, x='time', y='utilisation', marker='.', markersize=10.)\n", - "ax.set_xlabel('Time (mins)')\n", - "ax.set_ylabel('A100 Utilisation (%)')\n", - "ax.yaxis.set_major_formatter(PercentFormatter(100.))\n", - "ax.set_ylim([0., 100.])\n", - "ax.set_xlim([0., 20.])\n", - "ax.axhspan(80, 90., color='green', alpha=.2)\n", - "ax.text(1.2, 110., 'Epoch 1')\n", - "ax.text(7.6, 110., 'Epoch 2')\n", - "ax.text(13.6, 110., 'Epoch 3')\n", - "fig.tight_layout()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "fig.savefig('utilisation.pdf')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "zoobot39_dev", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From edc58bc60959fafe6fa79f15078c1e78309bbe35 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 14 Mar 2024 10:24:38 -0400 Subject: [PATCH 273/307] add from_scratch override --- zoobot/pytorch/training/finetune.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 37d79c8d..f1ad8b5e 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -79,7 +79,8 @@ def __init__( cosine_schedule=False, warmup_epochs=10, max_cosine_epochs=100, - max_learning_rate_reduction_factor=0.01 + max_learning_rate_reduction_factor=0.01, + from_scratch=False ): super().__init__() @@ -123,6 +124,8 @@ def __init__( self.max_cosine_epochs = max_cosine_epochs self.max_learning_rate_reduction_factor = max_learning_rate_reduction_factor + self.from_scratch = from_scratch + self.always_train_batchnorm = always_train_batchnorm if self.always_train_batchnorm: raise NotImplementedError('Temporarily deprecated, always_train_batchnorm=True not supported') @@ -159,6 +162,11 @@ def configure_optimizers(self): logging.info(f'Encoder architecture to finetune: {type(self.encoder)}') + if self.from_scratch: + logging.warning('self.from_scratch is True, training everything and ignoring all settings') + params += [{"params": self.encoder.parameters(), "lr": lr}] + return torch.optim.AdamW(params, weight_decay=self.weight_decay) + if isinstance(self.encoder, timm.models.EfficientNet): # includes v2 # TODO for now, these count as separate layers, not ideal early_tuneable_layers = [self.encoder.conv_stem, self.encoder.bn1] From fa60f668b84d60d718d5c14a6af9874fc9dbbf71 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 15 Mar 2024 13:43:57 -0400 Subject: [PATCH 274/307] load 0-1 webdatasets, add note --- zoobot/pytorch/datasets/webdatamodule.py | 4 ++-- zoobot/pytorch/datasets/webdataset_utils.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py 
b/zoobot/pytorch/datasets/webdatamodule.py index 05ce84e4..27dd586d 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -80,8 +80,8 @@ def make_image_transform(self, mode="train"): crop_ratio_bounds=self.crop_ratio_bounds, resize_after_crop=self.resize_after_crop, pytorch_greyscale=not self.color, - to_float=True # wrong, webdataset rgb decoder already converts to 0-1 float - # TODO this must be changed! will be different for new model training runs + to_float=False # True was wrong, webdataset rgb decoder already converts to 0-1 float + # TODO now changed on dev branch will be different for new model training runs ) # A.Compose object # logging.warning('Minimal augmentations for speed test') diff --git a/zoobot/pytorch/datasets/webdataset_utils.py b/zoobot/pytorch/datasets/webdataset_utils.py index a542c014..96d9afbd 100644 --- a/zoobot/pytorch/datasets/webdataset_utils.py +++ b/zoobot/pytorch/datasets/webdataset_utils.py @@ -63,6 +63,8 @@ def df_to_wds(df: pd.DataFrame, label_cols, save_loc: str, n_shards: int, sparse # in augs that could be 0.x-1.0, and here a pre-crop to 0.8 i.e. 340px # but this would change the centering # let's stick to small boundary crop and 0.75-0.85 in augs + + # turn these off for current euclidized images, already 300x300 A.CenterCrop( height=400, width=400, From 0fe5cecac4286bb23f6ac77f83f5e88578803a77 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 19 Mar 2024 18:14:46 -0400 Subject: [PATCH 275/307] make models portable --- .gitignore | 3 +- setup.py | 1 + tests/test_from_hub.py | 38 +++++++++++++ zoobot/pytorch/estimators/define_model.py | 5 +- zoobot/pytorch/training/finetune.py | 64 +++++++++++++++------- zoobot/pytorch/training/representations.py | 50 ++++++++++++++--- 6 files changed, 132 insertions(+), 29 deletions(-) create mode 100644 tests/test_from_hub.py diff --git a/.gitignore b/.gitignore index d7ae58f9..ff65996f 100755 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,5 @@ hparams.yaml data/pretrained_models -*.tar \ No newline at end of file +*.tar +*.ckpt \ No newline at end of file diff --git a/setup.py b/setup.py index 1bf92be6..4f9d7188 100755 --- a/setup.py +++ b/setup.py @@ -112,6 +112,7 @@ 'pyarrow', # to read parquet, which is very handy for big datasets # for saving metrics to weights&biases (cloud service, free within limits) 'wandb', + 'huggingface_hub', # login may be required 'setuptools', # no longer pinned 'galaxy-datasets>=0.0.15' # for dataset loading in both TF and Torch (see github/mwalmsley/galaxy-datasets) ] diff --git a/tests/test_from_hub.py b/tests/test_from_hub.py new file mode 100644 index 00000000..9bc1c115 --- /dev/null +++ b/tests/test_from_hub.py @@ -0,0 +1,38 @@ +import pytest + +import timm +import torch + + +def test_get_encoder(): + model = timm.create_model("hf_hub:mwalmsley/zoobot-encoder-efficientnet_b0", pretrained=True) + assert model(torch.rand(1, 3, 224, 224)).shape == (1, 1280) + + +def test_get_finetuned(): + # checkpoint_loc = 'https://huggingface.co/mwalmsley/zoobot-finetuned-is_tidal/resolve/main/3.ckpt' pickle problem via lightning + # checkpoint_loc = '/home/walml/Downloads/3.ckpt' # works when downloaded manually + + from huggingface_hub import hf_hub_download + + REPO_ID = "mwalmsley/zoobot-finetuned-is_tidal" + FILENAME = "4.ckpt" + + downloaded_loc = hf_hub_download( + repo_id=REPO_ID, + filename=FILENAME, + ) + from zoobot.pytorch.training import finetune + model = 
finetune.FinetuneableZoobotClassifier.load_from_checkpoint(downloaded_loc, map_location='cpu') # hub_name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', + assert model(torch.rand(1, 3, 224, 224)).shape == (1, 2) + + + +# def test_get_finetuned_from_local(): +# # checkpoint_loc = '/home/walml/repos/zoobot/tests/convnext_nano_finetuned_linear_is-lsb.ckpt' +# checkpoint_loc = '/home/walml/repos/zoobot-foundation/results/finetune/is-lsb/debug/checkpoints/4.ckpt' + +# from zoobot.pytorch.training import finetune +# # if originally trained with a direct in-memory checkpoint, must specify the hub name manually. otherwise it's saved as an hparam. +# model = finetune.FinetuneableZoobotClassifier.load_from_checkpoint(checkpoint_loc, map_location='cpu') # hub_name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ) +# assert model(torch.rand(1, 3, 224, 224)).shape == (1, 2) \ No newline at end of file diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index d35dc151..d04ab746 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -480,4 +480,7 @@ def schema_to_campaigns(schema): if __name__ == '__main__': encoder = get_pytorch_encoder(channels=1) dim = get_encoder_dim(encoder, channels=1) - print(dim) \ No newline at end of file + print(dim) + + + ZoobotTree.load_from_checkpoint \ No newline at end of file diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index f1ad8b5e..0ef638fb 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -43,11 +43,17 @@ class FinetuneableZoobotAbstract(pl.LightningModule): Both :class:`FinetuneableZoobotClassifier` and :class:`FinetuneableZoobotTree` can (and should) be passed any of these arguments to customise finetuning. - You could subclass this class to solve new finetuning tasks (like regression) - see :ref:`advanced_finetuning`. + Any FinetuneableZoobot model can be loaded in one of three ways: + - HuggingFace name e.g. FinetuneableZoobotX(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. + - Any PyTorch model in memory e.g. FinetuneableZoobotX(encoder=some_model, ...) + - ZoobotTree checkpoint e.g. FinetuneableZoobotX(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) + + You could subclass this class to solve new finetuning tasks - see :ref:`advanced_finetuning`. Args: - checkpoint_loc (str, optional): Path to encoder checkpoint to load (likely a saved ZoobotTree). Defaults to None. - encoder (pl.LightningModule, optional): Alternatively, pass an encoder directly. Load with :func:`zoobot.pytorch.training.finetune.load_pretrained_encoder`. + name (str, optional): Name of a model on HuggingFace Hub e.g.'hf_hub:mwalmsley/zoobot-encoder-convnext_nano'. Defaults to None. + encoder (torch.nn.Module, optional): A PyTorch model already loaded in memory + zoobot_checkpoint_loc (str, optional): Path to ZoobotTree lightning checkpoint to load. Loads with Load with :func:`zoobot.pytorch.training.finetune.load_pretrained_encoder`. Defaults to None. encoder_dim (int, optional): Output dimension of encoder. Defaults to 1280 (EfficientNetB0's encoder dim). lr_decay (float, optional): For each layer i below the head, reduce the learning rate by lr_decay ^ i. Defaults to 0.75. weight_decay (float, optional): AdamW weight decay arg (i.e. L2 penalty). Defaults to 0.05. 
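For illustration, the three loading routes described in the docstring above might be used like this (a minimal sketch: num_classes=2 and the checkpoint path are placeholders, and the hub name is the one quoted in the docstring):

    from zoobot.pytorch.training.finetune import FinetuneableZoobotClassifier
    import timm

    # 1. by HuggingFace Hub name (recommended)
    model = FinetuneableZoobotClassifier(
        name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', num_classes=2)

    # 2. from any torch encoder already loaded in memory
    encoder = timm.create_model('convnext_nano', pretrained=False)
    model = FinetuneableZoobotClassifier(encoder=encoder, num_classes=2)

    # 3. from a saved ZoobotTree lightning checkpoint (placeholder path)
    model = FinetuneableZoobotClassifier(
        zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', num_classes=2)
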
@@ -61,26 +67,39 @@ class FinetuneableZoobotAbstract(pl.LightningModule): def __init__( self, - # can provide either zoobot_checkpoint_loc, and will load this model as encoder... - zoobot_checkpoint_loc=None, + + # load a pretrained timm encoder saved on huggingface hub + # (aimed at most users, easiest way to load published models) + name=None, + # ...or directly pass any model to use as encoder (if you do this, you will need to keep it around for later) - encoder=None, + # (aimed at tinkering with new architectures e.g. SSL) + encoder=None, # use any torch model already loaded in memory (must have .forward() method) + + # load a pretrained zoobottree model and grab the encoder (a timm model) + # requires the exact same zoobot version used for training, not very portable + # (aimed at supervised experiments) + zoobot_checkpoint_loc=None, + + # finetuning settings n_blocks=0, # how many layers deep to FT lr_decay=0.75, weight_decay=0.05, learning_rate=1e-4, # 10x lower than typical, you may like to experiment dropout_prob=0.5, always_train_batchnorm=False, # temporarily deprecated - prog_bar=True, - visualize_images=False, # upload examples to wandb, good for debugging - seed=42, n_layers=0, # for backward compat., n_blocks preferred # these args are for the optional learning rate scheduler, best not to use unless you've tuned everything else already cosine_schedule=False, warmup_epochs=10, max_cosine_epochs=100, max_learning_rate_reduction_factor=0.01, - from_scratch=False + # escape hatch for 'from scratch' baselines + from_scratch=False, + # debugging utils + prog_bar=True, + visualize_images=False, # upload examples to wandb, good for debugging + seed=42 ): super().__init__() @@ -95,17 +114,22 @@ def __init__( self.save_hyperparameters(ignore=['encoder']) # never serialise the encoder, way too heavy # if you need the encoder to recreate, pass when loading checkpoint e.g. # FinetuneableZoobotTree.load_from_checkpoint(loc, encoder=encoder) - - if zoobot_checkpoint_loc is not None: - assert encoder is None, 'Cannot pass both checkpoint to load and encoder to use' - self.encoder = load_pretrained_zoobot(zoobot_checkpoint_loc) + + if name is not None: + assert encoder is None, 'Cannot pass both name and encoder to use' + self.encoder = timm.create_model(name, pretrained=True) + self.encoder_dim = self.encoder.num_features + + elif zoobot_checkpoint_loc is not None: + assert encoder is None, 'Cannot pass both checkpoint to load and encoder to use' + self.encoder = load_pretrained_zoobot(zoobot_checkpoint_loc) # extracts the timm encoder + self.encoder_dim = self.encoder.num_features else: - assert zoobot_checkpoint_loc is None, 'Cannot pass both checkpoint to load and encoder to use' - assert encoder is not None, 'Must pass either checkpoint to load or encoder to use' - self.encoder = encoder - - # TODO read as encoder property - self.encoder_dim = define_model.get_encoder_dim(self.encoder) + assert zoobot_checkpoint_loc is None, 'Cannot pass both checkpoint to load and encoder to use' + assert encoder is not None, 'Must pass either checkpoint to load or encoder to use' + self.encoder = encoder + # work out encoder dim 'manually' + self.encoder_dim = define_model.get_encoder_dim(self.encoder) # for backwards compat. 
if n_layers: diff --git a/zoobot/pytorch/training/representations.py b/zoobot/pytorch/training/representations.py index dd8912a1..1c5fab19 100644 --- a/zoobot/pytorch/training/representations.py +++ b/zoobot/pytorch/training/representations.py @@ -1,17 +1,53 @@ +import logging import pytorch_lightning as pl +from timm import create_model + + class ZoobotEncoder(pl.LightningModule): - # very simple wrapper to turn pytorch model into lightning module - # useful when we want to use lightning to make predictions with our encoder - # (i.e. to get representations) - def __init__(self, encoder, pyramid=False) -> None: - super().__init__() + def __init__(self, encoder): + logging.info('ZoobotEncoder: using provided in-memory encoder') self.encoder = encoder # plain pytorch module e.g. Sequential - if pyramid: - raise NotImplementedError('Will eventually support resetting timm classifier to get FPN features') + def forward(self, x): if isinstance(x, list) and len(x) == 1: return self(x[0]) return self.encoder(x) + + @classmethod + def load_from_name(cls, name: str): + """ + e.g. ZoobotEncoder.load_from_name('hf_hub:mwalmsley/zoobot-encoder-convnext_nano') + Args: + name (str): huggingface hub name to load + + Returns: + nn.Module: timm model + """ + timm_model = create_model(name) + return cls(timm_model) + + + + + +class ZoobotEncoder(pl.LightningModule): + # very simple wrapper to turn pytorch model into lightning module + # useful when we want to use lightning to make predictions with our encoder + # (i.e. to get representations) + + # pretrained_cfg, pretrained_cfg_overlay=timm_kwargs + def __init__(self, architecture_name=None, channels=None, timm_kwargs={}) -> None: + super().__init__() + + logging.info('ZoobotEncoder: using timm encoder') + self.encoder = + + # if pyramid: + # raise NotImplementedError('Will eventually support resetting timm classifier to get FPN features') + + +# def save_timm_encoder(): + From e0fd96d0bc127b40156eb1d7209fa44d750aaafa Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 19 Mar 2024 18:15:31 -0400 Subject: [PATCH 276/307] typo --- zoobot/pytorch/training/representations.py | 24 +--------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/zoobot/pytorch/training/representations.py b/zoobot/pytorch/training/representations.py index 1c5fab19..09b379a4 100644 --- a/zoobot/pytorch/training/representations.py +++ b/zoobot/pytorch/training/representations.py @@ -28,26 +28,4 @@ def load_from_name(cls, name: str): """ timm_model = create_model(name) return cls(timm_model) - - - - - -class ZoobotEncoder(pl.LightningModule): - # very simple wrapper to turn pytorch model into lightning module - # useful when we want to use lightning to make predictions with our encoder - # (i.e. 
to get representations) - - # pretrained_cfg, pretrained_cfg_overlay=timm_kwargs - def __init__(self, architecture_name=None, channels=None, timm_kwargs={}) -> None: - super().__init__() - - logging.info('ZoobotEncoder: using timm encoder') - self.encoder = - - # if pyramid: - # raise NotImplementedError('Will eventually support resetting timm classifier to get FPN features') - - -# def save_timm_encoder(): - + \ No newline at end of file From bb6c40311366cba82eb7a1eeed593638411ecf29 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 19 Mar 2024 19:33:15 -0400 Subject: [PATCH 277/307] try on colab --- tests/test_from_hub.py | 7 ++++++- zoobot/pytorch/training/finetune.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tests/test_from_hub.py b/tests/test_from_hub.py index 9bc1c115..159f22c5 100644 --- a/tests/test_from_hub.py +++ b/tests/test_from_hub.py @@ -16,7 +16,7 @@ def test_get_finetuned(): from huggingface_hub import hf_hub_download REPO_ID = "mwalmsley/zoobot-finetuned-is_tidal" - FILENAME = "4.ckpt" + FILENAME = "FinetuneableZoobotClassifier.ckpt" downloaded_loc = hf_hub_download( repo_id=REPO_ID, @@ -26,7 +26,12 @@ def test_get_finetuned(): model = finetune.FinetuneableZoobotClassifier.load_from_checkpoint(downloaded_loc, map_location='cpu') # hub_name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', assert model(torch.rand(1, 3, 224, 224)).shape == (1, 2) +def test_get_finetuned_class_method(): + from zoobot.pytorch.training import finetune + + model = finetune.FinetuneableZoobotClassifier.load_from_name('mwalmsley/zoobot-finetuned-is_tidal', map_location='cpu') + assert model(torch.rand(1, 3, 224, 224)).shape == (1, 2) # def test_get_finetuned_from_local(): # # checkpoint_loc = '/home/walml/repos/zoobot/tests/convnext_nano_finetuned_linear_is-lsb.ckpt' diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 0ef638fb..dd7a66a3 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -377,6 +377,13 @@ def on_test_batch_end(self, outputs: dict, batch, batch_idx: int, dataloader_idx def upload_images_to_wandb(self, outputs, batch, batch_idx): raise NotImplementedError('Must be subclassed') + + @classmethod + def load_from_name(cls, name: str, **kwargs): + downloaded_loc = download_from_name(cls.__name__, name, **kwargs) + return cls.load_from_checkpoint(downloaded_loc, **kwargs) # trained on GPU, may need map_location='cpu' if you get a device error + + @@ -396,6 +403,8 @@ class FinetuneableZoobotClassifier(FinetuneableZoobotAbstract): """ + + def __init__( self, num_classes: int, @@ -762,3 +771,18 @@ def get_trainer( ) return trainer + + +def download_from_name(class_name: str, hub_name: str, **kwargs): + from huggingface_hub import hf_hub_download + + if hub_name.startswith('hf_hub:'): + logging.info('Passed name with hf_hub: prefix, dropping prefix') + repo_id = hub_name.split('hf_hub:')[1] + else: + repo_id = hub_name + downloaded_loc = hf_hub_download( + repo_id=repo_id, + filename=f"{class_name}.ckpt" + ) + return downloaded_loc \ No newline at end of file From 1c17b02d329af210f05ca599c9b455ec6d65b98a Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 21 Mar 2024 13:01:10 -0400 Subject: [PATCH 278/307] enforce 0-1 input --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py 
index 1877cdbc..728ac751 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -265,7 +265,15 @@ def train_default_zoobot_from_scratch( # inference_transform=transforms.GalaxyViewTransform(inference_transform_cfg), ) + # debug - check range of loaded images, should be 0-1 datamodule.setup(stage='fit') + for (images, _) in datamodule.train_dataloader(): + logging.info(f'Using batches of {images.shape[0]} images for training') + logging.info('First batch image min/max: {}/{}'.format(images.min(), images.max())) + assert images.max() <= 1.0 + assert images.min() >= 0.0 + break + # exit() # these args are automatically logged lightning_model = define_model.ZoobotTree( From 8543f5f0f70c8e58a5c236d6e31ef516fae9fcae Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 21 Mar 2024 13:51:10 -0400 Subject: [PATCH 279/307] bump galaxy-datasets --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4f9d7188..cbf095ea 100755 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ 'wandb', 'huggingface_hub', # login may be required 'setuptools', # no longer pinned - 'galaxy-datasets>=0.0.15' # for dataset loading in both TF and Torch (see github/mwalmsley/galaxy-datasets) + 'galaxy-datasets>=0.0.16' # for dataset loading in both TF and Torch (see github/mwalmsley/galaxy-datasets) ] ) From 48013c5b6e17cdc5e83f2f15ed55a5da24bcf2f0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 21 Mar 2024 14:25:15 -0400 Subject: [PATCH 280/307] add webdataset --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index cbf095ea..651f2cf7 100755 --- a/setup.py +++ b/setup.py @@ -112,6 +112,7 @@ 'pyarrow', # to read parquet, which is very handy for big datasets # for saving metrics to weights&biases (cloud service, free within limits) 'wandb', + 'webdataset', # for reading webdataset files 'huggingface_hub', # login may be required 'setuptools', # no longer pinned 'galaxy-datasets>=0.0.16' # for dataset loading in both TF and Torch (see github/mwalmsley/galaxy-datasets) From 33b612d9ba535818cc0afbb2be5a3b2dd210b048 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 21 Mar 2024 14:52:21 -0400 Subject: [PATCH 281/307] require schema-like args --- tests/pytorch/test_define_model.py | 3 ++- zoobot/pytorch/estimators/define_model.py | 17 +++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/pytorch/test_define_model.py b/tests/pytorch/test_define_model.py index 3805777d..f7628d22 100644 --- a/tests/pytorch/test_define_model.py +++ b/tests/pytorch/test_define_model.py @@ -10,6 +10,7 @@ def schema(): def test_ZoobotTree_init(schema): model = define_model.ZoobotTree( output_dim=12, - question_index_groups=schema.question_index_groups, + question_answer_pairs=schema.question_answer_pairs, + dependencies=schema.dependencies ) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index d04ab746..8670ba0e 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -177,7 +177,6 @@ class ZoobotTree(GenericLightningModule): Args: output_dim (int): Output dimension of model's head e.g. 34 for predicting a 34-answer decision tree. - question_index_groups (List): Mapping of which label indices are part of the same question. See :ref:`training_on_vote_counts`. architecture_name (str, optional): Architecture to use. Passed to timm. 
Must be in timm.list_models(). Defaults to "efficientnet_b0". channels (int, optional): Num. input channels. Probably 3 or 1. Defaults to 1. test_time_dropout (bool, optional): Apply dropout at test time, to pretend to be Bayesian. Defaults to True. @@ -192,7 +191,7 @@ def __init__( self, output_dim: int, # in the simplest case, this is all zoobot needs: grouping of label col indices as questions - question_index_groups: List=None, + # question_index_groups: List=None, # BUT # if you pass these, it enables better per-question and per-survey logging (because we have names) # must be passed as simple dicts, not objects, so can't just pass schema in @@ -219,7 +218,6 @@ def __init__( super().__init__( # these all do nothing, they are simply saved by lightning as hparams output_dim, - question_index_groups, question_answer_pairs, dependencies, architecture_name, @@ -236,13 +234,12 @@ def __init__( logging.info('Generic __init__ complete - moving to Zoobot __init__') - if question_answer_pairs is not None: - logging.info('question_index_groups/dependencies passed to Zoobot, constructing schema in __init__') - # assert question_index_groups is None, "Don't pass both question_index_groups and question_answer_pairs/dependencies" - assert dependencies is not None - self.schema = schemas.Schema(question_answer_pairs, dependencies) - # replace with schema-derived version - question_index_groups = self.schema.question_index_groups + # logging.info('question_index_groups/dependencies passed to Zoobot, constructing schema in __init__') + # assert question_index_groups is None, "Don't pass both question_index_groups and question_answer_pairs/dependencies" + assert dependencies is not None + self.schema = schemas.Schema(question_answer_pairs, dependencies) + # replace with schema-derived version + question_index_groups = self.schema.question_index_groups self.setup_metrics() From 4b83e10aa823d397f8aec8dbe043adf8d53960a0 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 21 Mar 2024 15:03:24 -0400 Subject: [PATCH 282/307] bump again --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 651f2cf7..14167d97 100755 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ 'webdataset', # for reading webdataset files 'huggingface_hub', # login may be required 'setuptools', # no longer pinned - 'galaxy-datasets>=0.0.16' # for dataset loading in both TF and Torch (see github/mwalmsley/galaxy-datasets) + 'galaxy-datasets>=0.0.17' # for dataset loading in both TF and Torch (see github/mwalmsley/galaxy-datasets) ] ) From 31d03baee56b945ac9921c847d3faf0f9d7deefe Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 21 Mar 2024 15:10:16 -0400 Subject: [PATCH 283/307] bump to python 3.9 --- .github/workflows/run_CI.yml | 2 +- Dockerfile.tf | 14 -------------- docker-compose-tf.yml | 11 ----------- setup.py | 2 +- 4 files changed, 2 insertions(+), 27 deletions(-) delete mode 100644 Dockerfile.tf delete mode 100644 docker-compose-tf.yml diff --git a/.github/workflows/run_CI.yml b/.github/workflows/run_CI.yml index 97a93d03..b9a1ce31 100644 --- a/.github/workflows/run_CI.yml +++ b/.github/workflows/run_CI.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.8", "3.9"] # zoobot should support these (many academics not on 3.9) + python-version: ["3.9"] # zoobot should support these experimental: [false] include: - python-version: "3.10" # test the next python version but allow it to fail diff --git a/Dockerfile.tf b/Dockerfile.tf deleted file 
mode 100644 index e7cfa547..00000000 --- a/Dockerfile.tf +++ /dev/null @@ -1,14 +0,0 @@ -FROM tensorflow/tensorflow:2.8.0 - -# if you have a supported nvidia GPU and https://github.com/NVIDIA/nvidia-docker -# FROM tensorflow/tensorflow:2.8.0-gpu - -WORKDIR /usr/src/zoobot - -# install dependencies but remove tensorflow as it's in the base image -COPY README.md . -COPY setup.py . -RUN pip install -U .[tensorflow] - -# install package -COPY . . diff --git a/docker-compose-tf.yml b/docker-compose-tf.yml deleted file mode 100644 index 7c3b5167..00000000 --- a/docker-compose-tf.yml +++ /dev/null @@ -1,11 +0,0 @@ -version: '3' - -services: - zoobot: - image: zoobot:tensorflow - build: - context: ./ - dockerfile: Dockerfile.tf - volumes: - # inject the code at run time to allow edits etc - - ./:/usr/src/zoobot diff --git a/setup.py b/setup.py index 14167d97..428e7172 100755 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ "Environment :: GPU :: NVIDIA CUDA" ], packages=setuptools.find_packages(), - python_requires=">=3.8", # recommend 3.9 for new users. TF needs >=3.7.2, torchvision>=3.8 + python_requires=">=3.9", # bumped to 3.9 for typing extras_require={ 'pytorch-cpu': [ # A100 GPU currently only seems to support cuda 11.3 on manchester cluster, let's stick with this version for now From 8e3498f7e6030df4d8e58fd4e8ac6138f2c699b8 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 29 Mar 2024 08:04:18 -0400 Subject: [PATCH 284/307] grab lightly cosine --- zoobot/pytorch/training/finetune.py | 203 ++++++++++++++++++++++++---- 1 file changed, 174 insertions(+), 29 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index dd7a66a3..52ff57bb 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -1,8 +1,6 @@ -# Based on Inigo's BYOL FT step -# https://github.com/inigoval/finetune/blob/main/finetune.py import logging import os -from typing import Any, Union +from typing import Any, Union, Optional import warnings from functools import partial @@ -35,13 +33,13 @@ def freeze_batchnorm_layers(model): class FinetuneableZoobotAbstract(pl.LightningModule): """ - Parent class of :class:`FinetuneableZoobotClassifier` and :class:`FinetuneableZoobotTree`. + Parent class of :class:`FinetuneableZoobotClassifier`, :class:`FinetuneableZoobotRegressor`, :class:`FinetuneableZoobotTree`. You cannot use this class directly - you must use the child classes above instead. - This class defines the finetuning methods that those child classes both use. - For example: when provided `checkpoint_loc`, it will load the encoder from that checkpoint. - Both :class:`FinetuneableZoobotClassifier` and :class:`FinetuneableZoobotTree` - can (and should) be passed any of these arguments to customise finetuning. + This class defines the shared finetuning args and methods used by those child classes. + For example: + - When provided `name`, it will load the HuggingFace encoder with that name (see below for more). + - When provided `learning_rate` it will set the optimizer to use that learning rate. Any FinetuneableZoobot model can be loaded in one of three ways: - HuggingFace name e.g. FinetuneableZoobotX(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. @@ -54,12 +52,17 @@ class FinetuneableZoobotAbstract(pl.LightningModule): name (str, optional): Name of a model on HuggingFace Hub e.g.'hf_hub:mwalmsley/zoobot-encoder-convnext_nano'. Defaults to None. 
encoder (torch.nn.Module, optional): A PyTorch model already loaded in memory zoobot_checkpoint_loc (str, optional): Path to ZoobotTree lightning checkpoint to load. Loads with Load with :func:`zoobot.pytorch.training.finetune.load_pretrained_encoder`. Defaults to None. - encoder_dim (int, optional): Output dimension of encoder. Defaults to 1280 (EfficientNetB0's encoder dim). + + n_blocks (int, optional): lr_decay (float, optional): For each layer i below the head, reduce the learning rate by lr_decay ^ i. Defaults to 0.75. weight_decay (float, optional): AdamW weight decay arg (i.e. L2 penalty). Defaults to 0.05. learning_rate (float, optional): AdamW learning rate arg. Defaults to 1e-4. dropout_prob (float, optional): P of dropout before final output layer. Defaults to 0.5. - always_train_batchnorm (bool, optional): If True, do not update batchnorm stats during finetuning. Defaults to True. + always_train_batchnorm (bool, optional): Temporarily deprecated. Previously, if True, do not update batchnorm stats during finetuning. Defaults to True. + cosine_schedule (bool, optional): Reduce the learning rate each epoch according to a cosine schedule, after warmup_epochs. Defaults to False. + warmup_epochs (int, optional): Linearly increase the learning rate from 0 to `learning_rate` over the first `warmup_epochs` epochs, before applying cosine schedule. No effect if cosine_schedule=False. + max_cosine_epochs (int, optional): Epochs for the scheduled learning rate to decay to final learning rate (below). Warmup epochs don't count. No effect if `cosine_schedule=False`. + max_learning_rate_reduction_factor (float, optional): prog_bar (bool, optional): Print progress bar during finetuning. Defaults to True. visualize_images (bool, optional): Upload example images to WandB. Good for debugging but slow. Defaults to False. seed (int, optional): random seed to use. Defaults to 42. @@ -88,10 +91,10 @@ def __init__( learning_rate=1e-4, # 10x lower than typical, you may like to experiment dropout_prob=0.5, always_train_batchnorm=False, # temporarily deprecated - n_layers=0, # for backward compat., n_blocks preferred + # n_layers=0, # for backward compat., n_blocks preferred. Now removed in v2. # these args are for the optional learning rate scheduler, best not to use unless you've tuned everything else already cosine_schedule=False, - warmup_epochs=10, + warmup_epochs=0, max_cosine_epochs=100, max_learning_rate_reduction_factor=0.01, # escape hatch for 'from scratch' baselines @@ -264,25 +267,25 @@ def configure_optimizers(self): logging.info('Optimizer ready, configuring scheduler') if self.cosine_schedule: - # logging.info('Using lightly cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) - # from lightly.utils.scheduler import CosineWarmupScheduler # new dependency for zoobot, TBD - maybe just copy - # # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers - # # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. 
- # lr_scheduler = CosineWarmupScheduler( - # optimizer=opt, - # warmup_epochs=self.warmup_epochs, - # max_epochs=self.max_cosine_epochs, - # start_value=self.learning_rate, - # end_value=self.learning_rate * self.max_learning_rate_reduction_factor, - # ) - - logging.info('Using CosineAnnealingLR schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) - lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + logging.info('Using lightly cosine schedule, warmup for {} epochs, max for {} epochs'.format(self.warmup_epochs, self.max_cosine_epochs)) + # from lightly.utils.scheduler import CosineWarmupScheduler #copied from here to avoid dependency + # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers + # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. + lr_scheduler = CosineWarmupScheduler( optimizer=opt, - T_max=self.max_cosine_epochs, - eta_min=self.learning_rate * self.max_learning_rate_reduction_factor + warmup_epochs=self.warmup_epochs, + max_epochs=self.max_cosine_epochs, + start_value=self.learning_rate, + end_value=self.learning_rate * self.max_learning_rate_reduction_factor, ) + # logging.info('Using CosineAnnealingLR schedule, warmup not supported, max for {} epochs'.format(self.max_cosine_epochs)) + # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + # optimizer=opt, + # T_max=self.max_cosine_epochs, + # eta_min=self.learning_rate * self.max_learning_rate_reduction_factor + # ) + return { "optimizer": opt, "lr_scheduler": { @@ -785,4 +788,146 @@ def download_from_name(class_name: str, hub_name: str, **kwargs): repo_id=repo_id, filename=f"{class_name}.ckpt" ) - return downloaded_loc \ No newline at end of file + return downloaded_loc + + + + +def cosine_schedule( + step: int, + max_steps: int, + start_value: float, + end_value: float, + period: Optional[int] = None, +) -> float: + """Use cosine decay to gradually modify start_value to reach target end_value during + iterations. + + Args: + step: + Current step number. + max_steps: + Total number of steps. + start_value: + Starting value. + end_value: + Target value. + period (optional): + The number of steps over which the cosine function completes a full cycle. + If not provided, it defaults to max_steps. + + Returns: + Cosine decay value. + + """ + if step < 0: + raise ValueError("Current step number can't be negative") + if max_steps < 1: + raise ValueError("Total step number must be >= 1") + if period is None and step > max_steps: + warnings.warn( + f"Current step number {step} exceeds max_steps {max_steps}.", + category=RuntimeWarning, + ) + if period is not None and period <= 0: + raise ValueError("Period must be >= 1") + + decay: float + if period is not None: # "cycle" based on period, if provided + decay = ( + end_value + - (end_value - start_value) * (np.cos(2 * np.pi * step / period) + 1) / 2 + ) + elif max_steps == 1: + # Avoid division by zero + decay = end_value + elif step == max_steps: + # Special case for Pytorch Lightning which updates LR scheduler also for epoch + # after last training epoch. + decay = end_value + else: + decay = ( + end_value + - (end_value - start_value) + * (np.cos(np.pi * step / (max_steps - 1)) + 1) + / 2 + ) + return decay + + +class CosineWarmupScheduler(torch.optim.lr_scheduler.LambdaLR): + """Cosine warmup scheduler for learning rate. 
+ + Args: + optimizer: + Optimizer object to schedule the learning rate. + warmup_epochs: + Number of warmup epochs or steps. + max_epochs: + Total number of training epochs or steps. + last_epoch: + The index of last epoch or step. Default: -1 + start_value: + Starting learning rate scale. Default: 1.0 + end_value: + Target learning rate scale. Default: 0.001 + verbose: + If True, prints a message to stdout for each update. Default: False. + + Note: The `epoch` arguments do not necessarily have to be epochs. Any step or index + can be used. The naming follows the Pytorch convention to use `epoch` for the steps + in the scheduler. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + warmup_epochs: int, + max_epochs: int, + last_epoch: int = -1, + start_value: float = 1.0, + end_value: float = 0.001, + period: Optional[int] = None, + verbose: bool = False, + ) -> None: + self.warmup_epochs = warmup_epochs + self.max_epochs = max_epochs + self.start_value = start_value + self.end_value = end_value + self.period = period + super().__init__( + optimizer=optimizer, + lr_lambda=self.scale_lr, + last_epoch=last_epoch, + verbose=verbose, + ) + + def scale_lr(self, epoch: int) -> float: + """ + Scale learning rate according to the current epoch number. + + Args: + epoch: + Current epoch number. + + Returns: + Scaled learning rate. + + """ + if epoch < self.warmup_epochs: + return self.start_value * (epoch + 1) / self.warmup_epochs + elif self.period is not None: + return cosine_schedule( + step=epoch - self.warmup_epochs, + max_steps=1, + start_value=self.start_value, + end_value=self.end_value, + period=self.period, + ) + else: + return cosine_schedule( + step=epoch - self.warmup_epochs, + max_steps=self.max_epochs - self.warmup_epochs, + start_value=self.start_value, + end_value=self.end_value, + ) From 573ad69bf90a9eadd07fb604dd7a76df40ae6651 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Fri, 29 Mar 2024 22:18:37 -0400 Subject: [PATCH 285/307] add docstrings throughout --- zoobot/pytorch/training/finetune.py | 169 +++++++++++++++++++-------- zoobot/pytorch/training/losses.py | 2 +- zoobot/tensorflow/training/losses.py | 4 +- 3 files changed, 126 insertions(+), 49 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 52ff57bb..0c46443a 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -4,6 +4,7 @@ import warnings from functools import partial +import numpy as np import pytorch_lightning as pl from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint @@ -62,7 +63,8 @@ class FinetuneableZoobotAbstract(pl.LightningModule): cosine_schedule (bool, optional): Reduce the learning rate each epoch according to a cosine schedule, after warmup_epochs. Defaults to False. warmup_epochs (int, optional): Linearly increase the learning rate from 0 to `learning_rate` over the first `warmup_epochs` epochs, before applying cosine schedule. No effect if cosine_schedule=False. max_cosine_epochs (int, optional): Epochs for the scheduled learning rate to decay to final learning rate (below). Warmup epochs don't count. No effect if `cosine_schedule=False`. - max_learning_rate_reduction_factor (float, optional): + max_learning_rate_reduction_factor (float, optional): Set final learning rate as `learning_rate` * `max_learning_rate_reduction_factor`. No effect if `cosine_schedule=False`. 
+ from_scratch (bool, optional): Ignore all settings above and train from scratch at `learning_rate` for all layers. Useful for a quick baseline. Defaults to False. prog_bar (bool, optional): Print progress bar during finetuning. Defaults to True. visualize_images (bool, optional): Upload example images to WandB. Good for debugging but slow. Defaults to False. seed (int, optional): random seed to use. Defaults to 42. @@ -134,12 +136,7 @@ def __init__( # work out encoder dim 'manually' self.encoder_dim = define_model.get_encoder_dim(self.encoder) - # for backwards compat. - if n_layers: - logging.warning('FinetuneableZoobot(n_layers) is now renamed to n_blocks, please update to pass n_blocks instead! For now, setting n_blocks=n_layers') - self.n_blocks = n_layers - else: - self.n_blocks = n_blocks + self.n_blocks = n_blocks self.learning_rate = learning_rate self.lr_decay = lr_decay @@ -394,20 +391,22 @@ class FinetuneableZoobotClassifier(FinetuneableZoobotAbstract): """ Pretrained Zoobot model intended for finetuning on a classification problem. - You must also pass either ``checkpoint_loc`` (to a saved encoder checkpoint) - or `encoder` (to a pytorch model already loaded in memory). - See :class:FinetuneableZoobotAbstract for more options. + Any args not listed below are passed to :class:``FinetuneableZoobotAbstract`` (for example, `learning_rate`). + These are shared between classifier, regressor, and tree models. + See the docstring of :class:``FinetuneableZoobotAbstract`` for more. - Any args not in the list below are passed to :class:``FinetuneableZoobotAbstract`` (usually to specify how to carry out the finetuning) + Models can be loaded in one of three ways: + - HuggingFace name e.g. FinetuneableZoobotClassifier(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. + - Any PyTorch model in memory e.g. FinetuneableZoobotClassifier(encoder=some_model, ...) + - ZoobotTree checkpoint e.g. FinetuneableZoobotClassifier(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) Args: num_classes (int): num. of target classes (e.g. 2 for binary classification). label_smoothing (float, optional): See torch cross_entropy_loss docs. Defaults to 0. + class_weights (arraylike, optional): See torch cross_entropy_loss docs. Defaults to None. """ - - def __init__( self, num_classes: int, @@ -508,10 +507,18 @@ class FinetuneableZoobotRegressor(FinetuneableZoobotAbstract): """ Pretrained Zoobot model intended for finetuning on a regression problem. - See FinetuneableZoobotClassifier, above + Any args not listed below are passed to :class:``FinetuneableZoobotAbstract`` (for example, `learning_rate`). + These are shared between classifier, regressor, and tree models. + See the docstring of :class:``FinetuneableZoobotAbstract`` for more. + + Models can be loaded in one of three ways: + - HuggingFace name e.g. FinetuneableZoobotRegressor(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. + - Any PyTorch model in memory e.g. FinetuneableZoobotRegressor(encoder=some_model, ...) + - ZoobotTree checkpoint e.g. FinetuneableZoobotRegressor(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) + Args: - None besides those from FinetuneableZoobotAbstract, above (1 class, MSE error, for now) + unit_interval (bool, optional): If True, use sigmoid activation for the final layer, ensuring predictions between 0 and 1. Defaults to False. 
""" @@ -606,11 +613,23 @@ def predict_step(self, x: Union[list[torch.Tensor], torch.Tensor], batch_idx): class FinetuneableZoobotTree(FinetuneableZoobotAbstract): """ - Pretrained Zoobot model intended for finetuning on a decision tree (i.e. GZ-like) problem. + Pretrained Zoobot model intended for finetuning on a decision tree (i.e. GZ-like) problem. + Uses Dirichlet-Multinomial loss introduced in GZ DECaLS. + Briefly: predicts a Dirichlet distribution for the probability of a typical volunteer giving each answer, + and uses the Dirichlet-Multinomial loss to compare the predicted distribution of votes (given k volunteers were asked) to the true distribution. + + Does not produce accuracy or MSE metrics, as these are not relevant for this task. Loss logging only. - You must also pass either ``checkpoint_loc`` (to a saved encoder checkpoint) - or ``encoder`` (to a pytorch model already loaded in memory). - See :class:FinetuneableZoobotAbstract for more options. + If you're using this, you're probably working on a Galaxy Zoo catalog, and you should Slack Mike! + + Any args not listed below are passed to :class:``FinetuneableZoobotAbstract`` (for example, `learning_rate`). + These are shared between classifier, regressor, and tree models. + See the docstring of :class:``FinetuneableZoobotAbstract`` for more. + + Models can be loaded in one of three ways: + - HuggingFace name e.g. FinetuneableZoobotRegressor(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. + - Any PyTorch model in memory e.g. FinetuneableZoobotRegressor(encoder=some_model, ...) + - ZoobotTree checkpoint e.g. FinetuneableZoobotRegressor(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) Args: schema (schemas.Schema): description of the layout of the decision tree. See :class:`zoobot.shared.schemas.Schema`. @@ -639,16 +658,30 @@ def __init__( self.loss = define_model.get_dirichlet_loss_func(self.schema.question_index_groups) def upload_images_to_wandb(self, outputs, batch, batch_idx): - pass # not yet implemented + raise NotImplementedError # other functions are simply inherited from FinetunedZoobotAbstract -# https://github.com/inigoval/byol/blob/1da1bba7dc5cabe2b47956f9d7c6277decd16cc7/byol_main/networks/models.py#L29 class LinearHead(torch.nn.Module): - def __init__(self, input_dim, output_dim, dropout_prob=0.5, activation=None): + def __init__(self, input_dim: int, output_dim: int, dropout_prob=0.5, activation=None): + """ + Small utility class for a linear head with dropout and optional choice of activation. + + - Apply dropout to features before the final linear layer. + - Apply a final linear layer + - Optionally, apply `activation` callable + + Args: + input_dim (int): input dim of the linear layer (i.e. the encoder output dimension) + output_dim (int): output dim of the linear layer (often e.g. N for N classes, or 1 for regression) + dropout_prob (float, optional): Dropout probability. Defaults to 0.5. + activation (callable, optional): callable expecting tensor e.g. torch softmax. Defaults to None. 
+ """ # input dim is representation dim, output_dim is num classes super(LinearHead, self).__init__() + self.input_dim = input_dim self.output_dim = output_dim + self.dropout = torch.nn.Dropout(p=dropout_prob) self.linear = torch.nn.Linear(input_dim, output_dim) self.activation = activation @@ -666,36 +699,54 @@ def forward(self, x): -def cross_entropy_loss(y_pred, y, label_smoothing=0., weight=None): - # y should be shape (batch) and ints - # y_pred should be shape (batch, classes) - # returns loss of shape (batch) - # will reduce myself +def cross_entropy_loss(y_pred: torch.Tensor, y: torch.Tensor, label_smoothing: float=0., weight=None): + """ + Calculate cross-entropy loss with optional label smoothing and class weights. No aggregation applied. + Trivial wrapper of torch.nn.functional.cross_entropy with reduction='none'. + + Args: + y_pred (torch.Tensor): ints of shape (batch) + y (torch.Tensor): predictions of shape (batch, classes) + label_smoothing (float, optional): See docstring of torch.nn.functional.cross_entropy. Defaults to 0.. + weight (arraylike, optional): See docstring of torch.nn.functional.cross_entropy. Defaults to None. + + Returns: + torch.Tensor: unreduced cross-entropy loss + """ return F.cross_entropy(y_pred, y.long(), label_smoothing=label_smoothing, weight=weight, reduction='none') + def mse_loss(y_pred, y): - # y should be shape (batch) and ints - # y_pred should be shape (batch, classes) - # returns loss of shape (batch) - # will reduce myself + """ + Trivial wrapper of torch.nn.functional.mse_loss with reduction='none'. + + Args: + y_pred (torch.Tensor): See docstring of torch.nn.functional.mse_loss. + y (torch.Tensor): See docstring of torch.nn.functional.mse_loss. + + Returns: + torch.Tensor: See docstring of torch.nn.functional.mse_loss. + """ return F.mse_loss(y_pred, y, reduction='none') -def dirichlet_loss(y_pred, y, question_index_groups): - # aggregation equiv. to sum(axis=1).mean(), but fewer operations - # returns loss of shape (batch) - # my func uses sklearn convention y, y_pred - return losses.calculate_multiquestion_loss(y, y_pred, question_index_groups).mean()*len(question_index_groups) +def dirichlet_loss(y_pred: torch.Tensor, y: torch.Tensor, question_index_groups): + """ + Calculate Dirichlet-Multinomial loss for a batch of predictions and labels. + Returns a scalar loss (ready for gradient descent) by summing across answers and taking a mean across the batch. + Reduction equivalent to sum(axis=1).mean(), but with fewer operations. + Args: + y_pred (torch.Tensor): Predicted dirichlet distribution, of shape (batch, answers) + y (torch.Tensor): Count of volunteer votes for each answer, of shape (batch, answers) + question_index_groups (list): Answer indices for each question i.e. [(question.start_index, question.end_index), ...] for all questions. Useful for slicing model predictions by question. See :ref:`schemas`. -class FinetunedZoobotClassifierBaseline(FinetuneableZoobotClassifier): - # exactly as the Finetuned model above, but with a simple single learning rate - # useful for training from-scratch model exactly as if it were finetuned, as a baseline + Returns: + torch.Tensor: Dirichlet-Multinomial loss. Scalar, summing across answers and taking a mean across the batch i.e. 
sum(axis=1).mean()) + """ + # my func uses sklearn convention y, y_pred + return losses.calculate_multiquestion_loss(y, y_pred, question_index_groups).mean()*len(question_index_groups) - def configure_optimizers(self): - head_params = list(self.head.parameters()) - encoder_params = list(self.encoder.parameters()) - return torch.optim.AdamW(head_params + encoder_params, lr=self.learning_rate) def load_pretrained_zoobot(checkpoint_loc: str) -> torch.nn.Module: @@ -726,9 +777,18 @@ def get_trainer( **trainer_kwargs ) -> pl.Trainer: """ - PyTorch Lightning Trainer that carries out the finetuning process. + Convenience wrapper to create a PyTorch Lightning Trainer that carries out the finetuning process. Use like so: trainer.fit(model, datamodule) + `get_trainer` args are for common Trainer settings e.g. early stopping checkpointing, etc. By default: + - Saves the top-k models based on validation loss + - Uses early stopping with `patience` i.e. end training if validation loss does not improve after `patience` epochs. + - Monitors the learning rate (useful when using a learning rate scheduler) + + Any extra args not listed below are passed directly to the PyTorch Lightning Trainer. + Use this to add any custom configuration not covered by the `get_trainer` args. + See https://lightning.ai/docs/pytorch/stable/common/trainer.html + Args: save_dir (str): folder in which to save checkpoints and logs. file_template (str, optional): custom naming for checkpoint files. See Lightning docs. Defaults to "{epoch}". @@ -776,7 +836,22 @@ def get_trainer( return trainer -def download_from_name(class_name: str, hub_name: str, **kwargs): +def download_from_name(class_name: str, hub_name: str): + """ + Download a finetuned model from the HuggingFace Hub by name. + Used to load pretrained Zoobot models by name, e.g. FinetuneableZoobotClassifier(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). + + Downloaded models are saved to the HuggingFace cache directory for later use (typically ~/.cache/huggingface). + + You shouldn't need to call this; it's used internally by the FinetuneableZoobot classes. + + Args: + class_name (str): one of FinetuneableZoobotClassifier, FinetuneableZoobotRegressor, FinetuneableZoobotTree + hub_name (str): e.g. mwalmsley/zoobot-encoder-convnext_nano + + Returns: + str: path to downloaded model (in HuggingFace cache directory). Likely then loaded by Lightning. + """ from huggingface_hub import hf_hub_download if hub_name.startswith('hf_hub:'): @@ -800,8 +875,10 @@ def cosine_schedule( end_value: float, period: Optional[int] = None, ) -> float: - """Use cosine decay to gradually modify start_value to reach target end_value during + """ + Use cosine decay to gradually modify start_value to reach target end_value during iterations. + Copied from lightly library (thank you for open sourcing) Args: step: diff --git a/zoobot/pytorch/training/losses.py b/zoobot/pytorch/training/losses.py index 77b15761..712b6846 100755 --- a/zoobot/pytorch/training/losses.py +++ b/zoobot/pytorch/training/losses.py @@ -15,7 +15,7 @@ def calculate_multiquestion_loss(labels: torch.Tensor, predictions: torch.Tensor Args: labels (torch.Tensor): (galaxy, k successes) where k successes dimension is indexed by question_index_groups. predictions (torch.Tensor): Dirichlet concentrations, matching shape of labels - question_index_groups (list): Paired (tuple) integers of (first, last) indices of answers to each question, listed for all questions. See :ref:`schemas`. 
+ question_index_groups (list): Answer indices for each question i.e. [(question.start_index, question.end_index), ...] for all questions. Useful for slicing model predictions by question. See :ref:`schemas`. Returns: torch.Tensor: neg. log likelihood of shape (batch, question). diff --git a/zoobot/tensorflow/training/losses.py b/zoobot/tensorflow/training/losses.py index 12e5efa4..443ef117 100755 --- a/zoobot/tensorflow/training/losses.py +++ b/zoobot/tensorflow/training/losses.py @@ -12,7 +12,7 @@ def get_multiquestion_loss(question_index_groups, sum_over_questions=True, reduc tf.keras.losses.Reduction.SUM will simply add everything up, so divide by the global batch size externally with tf.reduce_sum Args: - question_index_groups (list): Answer indices for each question i.e. [(question.start_index, question.end_index), ...] for all questions. Useful for slicing model predictions by question. + question_index_groups (list): Answer indices for each question i.e. [(question.start_index, question.end_index), ...] for all questions. Useful for slicing model predictions by question. See :ref:`schemas`. Returns: MultiquestionLoss: see above. @@ -36,7 +36,7 @@ def calculate_multiquestion_loss(labels, predictions, question_index_groups, sum Args: labels (tf.Tensor): (galaxy, k successes) where k successes dimension is indexed by question_index_groups. predictions (tf.Tensor): Dirichlet concentrations, matching shape of labels - question_index_groups (list): Paired (tuple) integers of (first, last) indices of answers to each question, listed for all questions. + question_index_groups (list): Answer indices for each question i.e. [(question.start_index, question.end_index), ...] for all questions. Useful for slicing model predictions by question. See :ref:`schemas`. Returns: tf.Tensor: neg. log likelihood of shape (batch, question). 
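A minimal usage sketch of the finetuning API documented in this patch (the hub model name follows the docstrings above; the demo catalog, paths, and hyperparameters are illustrative placeholders, not part of the patch):

    from zoobot.pytorch.training import finetune
    from galaxy_datasets import galaxy_mnist
    from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule

    # small demo catalog: a pandas DataFrame with image locations and a 'label' column
    train_catalog, _ = galaxy_mnist(root='/tmp/galaxy_mnist', download=True, train=True)

    model = finetune.FinetuneableZoobotClassifier(
        name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano',  # load pretrained encoder by HuggingFace name (recommended)
        num_classes=4,
        n_blocks=1,  # finetune the final encoder block as well as the head
        cosine_schedule=True,  # decay the learning rate with the CosineWarmupScheduler above
        warmup_epochs=2,
        max_cosine_epochs=100
    )

    datamodule = GalaxyDataModule(label_cols=['label'], catalog=train_catalog, batch_size=32)
    trainer = finetune.get_trainer(save_dir='/tmp/finetune_demo', max_epochs=5)
    trainer.fit(model, datamodule)
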
From a588d72e2484e97825cdefe3b534e8ffeeca567b Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 30 Mar 2024 10:07:19 -0400 Subject: [PATCH 286/307] num_classes=0 --- setup.py | 14 +++++++------- tests/pytorch/test_finetune_classifier.py | 0 zoobot/pytorch/training/finetune.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) create mode 100644 tests/pytorch/test_finetune_classifier.py diff --git a/setup.py b/setup.py index 428e7172..157f3f19 100755 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ 'torch == 2.1.0+cpu', 'torchvision == 0.16.0+cpu', 'torchaudio >= 2.1.0', - 'pytorch-lightning >= 2.0.0', + 'lightning >= 2.0.0', # 'simplejpeg', 'albumentations', 'pyro-ppl >= 1.8.6', @@ -41,7 +41,7 @@ 'torch == 2.1.0', 'torchvision == 0.16.0', 'torchaudio >= 2.1.0', - 'pytorch-lightning >= 2.0.0', + 'lightning >= 2.0.0', 'albumentations', 'pyro-ppl >= 1.8.6', 'torchmetrics == 0.11.0', @@ -54,7 +54,7 @@ 'torch == 2.1.0+cu118', 'torchvision == 0.16.0+cu118', 'torchaudio >= 2.1.0', - 'pytorch-lightning >= 2.0.0', + 'lightning >= 2.0.0', 'albumentations', 'pyro-ppl >= 1.8.6', 'torchmetrics == 0.11.0', @@ -64,7 +64,7 @@ 'torch == 2.1.0+cu121', 'torchvision == 0.16.0+cu121', 'torchaudio >= 2.1.0', - 'pytorch-lightning >= 2.0.0', + 'lightning >= 2.0.0', 'albumentations', 'pyro-ppl >= 1.8.6', 'torchmetrics == 0.11.0', @@ -72,14 +72,14 @@ ], 'pytorch-colab': [ # colab includes pytorch already - 'pytorch-lightning >= 2.0.0', + 'lightning >= 2.0.0', 'albumentations', 'pyro-ppl>=1.8.0', 'torchmetrics==0.11.0', - 'timm == 0.9.10' + 'timm >= 0.9.10' ], # TODO may add narval/Digital Research Canada config - 'tensorflow': [ + 'tensorflow': [ # WARNING now deprecated 'tensorflow == 2.10.0', # 2.11.0 turns on XLA somewhere which then fails on multi-GPU...TODO 'keras_applications', 'tensorflow_probability == 0.18.0', # 0.19 requires tf 2.11 diff --git a/tests/pytorch/test_finetune_classifier.py b/tests/pytorch/test_finetune_classifier.py new file mode 100644 index 00000000..e69de29b diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 0c46443a..6e9fae12 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -122,7 +122,7 @@ def __init__( if name is not None: assert encoder is None, 'Cannot pass both name and encoder to use' - self.encoder = timm.create_model(name, pretrained=True) + self.encoder = timm.create_model(name, num_classes=0, pretrained=True) self.encoder_dim = self.encoder.num_features elif zoobot_checkpoint_loc is not None: From a7bba6d942e864966638ea7a2430b658289e88ab Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 30 Mar 2024 13:44:01 -0400 Subject: [PATCH 287/307] tinker to set cluster vars --- zoobot/pytorch/manchester.py | 75 +++++++++++-------- .../training/train_with_pytorch_lightning.py | 17 ++--- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/zoobot/pytorch/manchester.py b/zoobot/pytorch/manchester.py index 2ed39e6a..f1f966ad 100644 --- a/zoobot/pytorch/manchester.py +++ b/zoobot/pytorch/manchester.py @@ -1,44 +1,57 @@ import logging +import os -from lightning_lite.plugins.environments import SLURMEnvironment -# https://pytorch-lightning.readthedocs.io/en/stable/_modules/lightning_lite/plugins/environments/slurm.html#SLURMEnvironment -# https://github.com/Lightning-AI/lightning/blob/9c20cad40e4142f8a5e945fe26e919e598f2bd56/src/lightning_lite/plugins/environments/slurm.py -class ManchesterEnvironment(SLURMEnvironment): +# from lightning_lite.plugins.environments import 
SLURMEnvironment - def __init__(self, auto_requeue: bool = True, requeue_signal=None) -> None: - logging.info('Using Manchester SLURM environment') - super().__init__(auto_requeue, requeue_signal) +# # https://pytorch-lightning.readthedocs.io/en/stable/_modules/lightning_lite/plugins/environments/slurm.html#SLURMEnvironment +# # https://github.com/Lightning-AI/lightning/blob/9c20cad40e4142f8a5e945fe26e919e598f2bd56/src/lightning_lite/plugins/environments/slurm.py +# class ManchesterEnvironment(SLURMEnvironment): - # @staticmethod - # def resolve_root_node_address(nodes: str) -> str: - # root_node_address = super().resolve_root_node_address(nodes) - # logging.info(f'root_node_address: {root_node_address}') - # return root_node_address +# def __init__(self, auto_requeue: bool = True, requeue_signal=None) -> None: +# logging.info('Using Manchester SLURM environment') +# super().__init__(auto_requeue, requeue_signal) - @staticmethod - def detect() -> bool: - return True +# # @staticmethod +# # def resolve_root_node_address(nodes: str) -> str: +# # root_node_address = super().resolve_root_node_address(nodes) +# # logging.info(f'root_node_address: {root_node_address}') +# # return root_node_address + +# @staticmethod +# def detect() -> bool: +# return True - @property - def main_port(self) -> int: - main_port = super().main_port - logging.info(f'main_port: {main_port}') - return main_port - # MASTER_PORT will override +# @property +# def main_port(self) -> int: +# main_port = super().main_port +# logging.info(f'main_port: {main_port}') +# return main_port +# # MASTER_PORT will override + + +from pytorch_lightning.plugins.environments import SLURMEnvironment +class GalahadEnvironment(SLURMEnvironment): + def __init__(self, **kwargs): + ntasks_per_node = os.environ["SLURM_TASKS_PER_NODE"].split("(")[0] + os.environ["SLURM_NTASKS_PER_NODE"] = ntasks_per_node + # os.environ["SLURM_NTASKS"] = str(os.environ["SLURM_NTASKS_PER_NODE"]) + super().__init__(**kwargs) + self.nnodes = int(os.environ["SLURM_NNODES"]) + -if __name__ == '__main__': +# if __name__ == '__main__': - logging.basicConfig(level=logging.INFO) +# logging.basicConfig(level=logging.INFO) - # slurm_nodelist = "compute-0-[0,9]" # 0,9 works - slurm_nodelist = "compute-0-[0,11]" # 0,11 hangs - # 70017 8-9 works +# # slurm_nodelist = "compute-0-[0,9]" # 0,9 works +# slurm_nodelist = "compute-0-[0,11]" # 0,11 hangs +# # 70017 8-9 works - env = ManchesterEnvironment() - root = env.resolve_root_node_address(slurm_nodelist) - print(root) +# env = GalahadEnvironment() +# root = env.resolve_root_node_address(slurm_nodelist) +# print(root) - print(env.detect()) +# print(env.detect()) - print(env.main_port) \ No newline at end of file +# print(env.main_port) \ No newline at end of file diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 728ac751..344e81d2 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -135,19 +135,18 @@ def train_default_zoobot_from_scratch( if (gpus is not None) and (gpus > 1): strategy = DDPStrategy(find_unused_parameters=False) # static_graph=True TODO logging.info('Using multi-gpu training') - if nodes > 1: # I assume nobody is doing multi-node cpu training... - logging.info('Using multi-node training') # purely for your info + # if nodes > 1: # I assume nobody is doing multi-node cpu training... 
+ # logging.info('Using multi-node training') # purely for your info # this is only needed for multi-node training # our cluster sets TASKS_PER_NODE not NTASKS_PER_NODE # (with srun, SLURM_STEP_TASKS_PER_NODE) # https://slurm.schedmd.com/srun.html#OPT_SLURM_STEP_TASKS_PER_NODE - if 'SLURM_NTASKS_PER_NODE' not in os.environ.keys(): - os.environ['SLURM_NTASKS_PER_NODE'] = os.environ['SLURM_TASKS_PER_NODE'] - # from lightning_lite.plugins.environments import SLURMEnvironment - from zoobot.pytorch import manchester - logging.warning('Using custom slurm environment') - # https://pytorch-lightning.readthedocs.io/en/stable/clouds/cluster_advanced.html#enable-auto-wall-time-resubmitions - plugins = [manchester.ManchesterEnvironment(auto_requeue=False)] + if 'SLURM_NTASKS_PER_NODE' not in os.environ.keys(): + os.environ['SLURM_NTASKS_PER_NODE'] = os.environ['SLURM_TASKS_PER_NODE'] + from zoobot.pytorch import manchester + logging.warning('Using custom slurm environment') + # https://pytorch-lightning.readthedocs.io/en/stable/clouds/cluster_advanced.html#enable-auto-wall-time-resubmitions + plugins = [manchester.GalahadEnvironment(auto_requeue=False)] if gpus > 0: accelerator = 'gpu' From 0732853b4f6e34ec21e5f142f483d6ecc278ab0e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sat, 30 Mar 2024 16:55:15 -0400 Subject: [PATCH 288/307] require galaxy-datasets --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 157f3f19..9c50ad2a 100755 --- a/setup.py +++ b/setup.py @@ -76,7 +76,8 @@ 'albumentations', 'pyro-ppl>=1.8.0', 'torchmetrics==0.11.0', - 'timm >= 0.9.10' + 'timm >= 0.9.10', + 'galaxy_datasets == 0.0.17' ], # TODO may add narval/Digital Research Canada config 'tensorflow': [ # WARNING now deprecated From 0bffcb5647b39eff038d497017c3280aaed8f745 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 11:03:59 -0400 Subject: [PATCH 289/307] tiny tweak --- zoobot/pytorch/predictions/predict_on_catalog.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zoobot/pytorch/predictions/predict_on_catalog.py b/zoobot/pytorch/predictions/predict_on_catalog.py index 0b99270f..918593c2 100644 --- a/zoobot/pytorch/predictions/predict_on_catalog.py +++ b/zoobot/pytorch/predictions/predict_on_catalog.py @@ -85,3 +85,5 @@ def predict(catalog: pd.DataFrame, model: pl.LightningModule, n_samples: int, la end = datetime.datetime.fromtimestamp(time.time()) logging.info('Completed at: {}'.format(end.strftime('%Y-%m-%d %H:%M:%S'))) logging.info('Time elapsed: {}'.format(end - start)) + + return predictions From d2e415aedb72cc3b60b1b236d2caad37319130ec Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 12:12:26 -0400 Subject: [PATCH 290/307] add mae support --- zoobot/pytorch/training/finetune.py | 40 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 6e9fae12..2bae0a60 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -518,13 +518,15 @@ class FinetuneableZoobotRegressor(FinetuneableZoobotAbstract): Args: + loss (str, optional): Loss function to use. Must be one of 'mse', 'mae'. Defaults to 'mse'. unit_interval (bool, optional): If True, use sigmoid activation for the final layer, ensuring predictions between 0 and 1. Defaults to False. 
""" def __init__( self, - unit_interval=False, + loss:str='mse', + unit_interval:bool=False, **super_kwargs) -> None: super().__init__(**super_kwargs) @@ -543,7 +545,13 @@ def __init__( dropout_prob=self.dropout_prob, activation=head_activation ) - self.loss = mse_loss + if loss in ['mse', 'mean_squared_error']: + self.loss = mse_loss + elif loss in ['mae', 'mean_absolute_error', 'l1', 'l1_loss']: + self.loss = l1_loss + else: + raise ValueError(f'Loss {loss} not recognised. Must be one of mse, mae') + # rmse metrics. loss is mse already. self.train_rmse = tm.MeanSquaredError(squared=False) self.val_rmse = tm.MeanSquaredError(squared=False) @@ -595,21 +603,6 @@ def predict_step(self, x: Union[list[torch.Tensor], torch.Tensor], batch_idx): return self(x[0]) return self.forward(x) - # TODO - # def upload_images_to_wandb(self, outputs, batch, batch_idx): - # # self.logger is set by pl.Trainer(logger=) argument - # if (self.logger is not None) and (batch_idx == 0): - # x, y = batch - # y_pred_softmax = F.softmax(outputs['predictions'], dim=1) - # n_images = 5 - # images = [img for img in x[:n_images]] - # captions = [f'Ground Truth: {y_i} \nPrediction: {y_p_i}' for y_i, y_p_i in zip( - # y[:n_images], y_pred_softmax[:n_images])] - # self.logger.log_image( # type: ignore - # key='val_images', - # images=images, - # caption=captions) - class FinetuneableZoobotTree(FinetuneableZoobotAbstract): """ @@ -729,6 +722,19 @@ def mse_loss(y_pred, y): """ return F.mse_loss(y_pred, y, reduction='none') +def l1_loss(y_pred, y): + """ + Trivial wrapper of torch.nn.functional.l1_loss with reduction='none'. + + Args: + y_pred (torch.Tensor): See docstring of torch.nn.functional.l1_loss. + y (torch.Tensor): See docstring of torch.nn.functional.l1_loss. + + Returns: + torch.Tensor: See docstring of torch.nn.functional.l1_loss. + """ + return F.l1_loss(y_pred, y, reduction='none') + def dirichlet_loss(y_pred: torch.Tensor, y: torch.Tensor, question_index_groups): """ From 94ad1ee1ec9efe94e3cf105483e67c0d020284ac Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 12:52:09 -0400 Subject: [PATCH 291/307] small note --- zoobot/pytorch/estimators/define_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 8670ba0e..3ecc30e0 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -309,6 +309,8 @@ def configure_optimizers(self): patience=self.scheduler_params.get('patience', 5) ) return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'validation/loss'} + # TODO add cosine scheduler support here, same args as FinetuneableZoobot + # work on this for big model sweep else: logging.info('No scheduler used') return optimizer # no scheduler From b1fbe2e03aea0f577aeaa01a599c21c54147cdcc Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 14:33:52 -0400 Subject: [PATCH 292/307] typo --- zoobot/pytorch/training/representations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/representations.py b/zoobot/pytorch/training/representations.py index 09b379a4..a350241b 100644 --- a/zoobot/pytorch/training/representations.py +++ b/zoobot/pytorch/training/representations.py @@ -7,6 +7,7 @@ class ZoobotEncoder(pl.LightningModule): def __init__(self, encoder): + super().__init__() logging.info('ZoobotEncoder: using provided in-memory encoder') self.encoder = encoder # plain pytorch module e.g. 
Sequential @@ -26,6 +27,6 @@ def load_from_name(cls, name: str): Returns: nn.Module: timm model """ - timm_model = create_model(name) + timm_model = create_model(name, pretrained=True) return cls(timm_model) \ No newline at end of file From 2fd8775406044bb88413646a7d80763af95f8a6e Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 14:45:03 -0400 Subject: [PATCH 293/307] change color->greyscale for consistency --- zoobot/pytorch/datasets/webdatamodule.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zoobot/pytorch/datasets/webdatamodule.py b/zoobot/pytorch/datasets/webdatamodule.py index 27dd586d..3156d145 100644 --- a/zoobot/pytorch/datasets/webdatamodule.py +++ b/zoobot/pytorch/datasets/webdatamodule.py @@ -25,7 +25,7 @@ def __init__( num_workers=4, prefetch_factor=4, cache_dir=None, - color=False, + greyscale=False, crop_scale_bounds=(0.7, 0.8), crop_ratio_bounds=(0.9, 1.1), resize_after_crop=224, @@ -58,7 +58,7 @@ def __init__( self.cache_dir = cache_dir # could use mixin - self.color = color + self.greyscale = greyscale self.resize_after_crop = resize_after_crop self.crop_scale_bounds = crop_scale_bounds self.crop_ratio_bounds = crop_ratio_bounds @@ -79,7 +79,7 @@ def make_image_transform(self, mode="train"): crop_scale_bounds=self.crop_scale_bounds, crop_ratio_bounds=self.crop_ratio_bounds, resize_after_crop=self.resize_after_crop, - pytorch_greyscale=not self.color, + pytorch_greyscale=self.greyscale, to_float=False # True was wrong, webdataset rgb decoder already converts to 0-1 float # TODO now changed on dev branch will be different for new model training runs ) # A.Compose object From 5338b8168d99fee8958fba3f84bfac737d970e75 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 16:00:37 -0400 Subject: [PATCH 294/307] add cosine scheduler option --- zoobot/pytorch/estimators/define_model.py | 14 +++- zoobot/pytorch/training/finetune.py | 81 +---------------------- zoobot/pytorch/training/schedulers.py | 77 +++++++++++++++++++++ 3 files changed, 90 insertions(+), 82 deletions(-) create mode 100644 zoobot/pytorch/training/schedulers.py diff --git a/zoobot/pytorch/estimators/define_model.py b/zoobot/pytorch/estimators/define_model.py index 3ecc30e0..f5980492 100755 --- a/zoobot/pytorch/estimators/define_model.py +++ b/zoobot/pytorch/estimators/define_model.py @@ -10,7 +10,7 @@ from zoobot.shared import schemas from zoobot.pytorch.estimators import efficientnet_custom, custom_layers -from zoobot.pytorch.training import losses +from zoobot.pytorch.training import losses, schedulers # overall strategy # timm for defining complicated pytorch modules @@ -309,8 +309,16 @@ def configure_optimizers(self): patience=self.scheduler_params.get('patience', 5) ) return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'validation/loss'} - # TODO add cosine scheduler support here, same args as FinetuneableZoobot - # work on this for big model sweep + elif self.scheduler_params.get('cosine_schedule', False): + logging.info('Using cosine schedule') + scheduler = schedulers.CosineWarmupScheduler( + optimizer=optimizer, + warmup_epochs=self.scheduler_params['warmup_epochs'], + max_epochs=self.scheduler_params['max_cosine_epochs'], + start_value=self.learning_rate, + end_value=self.learning_rate * self.scheduler_params['max_learning_rate_reduction_factor'] + ) + return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'validation/loss'} else: logging.info('No scheduler used') return optimizer # no scheduler diff --git 
a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 2bae0a60..825a4235 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -15,7 +15,7 @@ import torchmetrics as tm import timm -from zoobot.pytorch.training import losses +from zoobot.pytorch.training import losses, schedulers from zoobot.pytorch.estimators import define_model from zoobot.shared import schemas @@ -268,7 +268,7 @@ def configure_optimizers(self): # from lightly.utils.scheduler import CosineWarmupScheduler #copied from here to avoid dependency # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers # Dictionary, with an "optimizer" key, and (optionally) a "lr_scheduler" key whose value is a single LR scheduler or lr_scheduler_config. - lr_scheduler = CosineWarmupScheduler( + lr_scheduler = schedulers.CosineWarmupScheduler( optimizer=opt, warmup_epochs=self.warmup_epochs, max_epochs=self.max_cosine_epochs, @@ -937,80 +937,3 @@ def cosine_schedule( ) return decay - -class CosineWarmupScheduler(torch.optim.lr_scheduler.LambdaLR): - """Cosine warmup scheduler for learning rate. - - Args: - optimizer: - Optimizer object to schedule the learning rate. - warmup_epochs: - Number of warmup epochs or steps. - max_epochs: - Total number of training epochs or steps. - last_epoch: - The index of last epoch or step. Default: -1 - start_value: - Starting learning rate scale. Default: 1.0 - end_value: - Target learning rate scale. Default: 0.001 - verbose: - If True, prints a message to stdout for each update. Default: False. - - Note: The `epoch` arguments do not necessarily have to be epochs. Any step or index - can be used. The naming follows the Pytorch convention to use `epoch` for the steps - in the scheduler. - """ - - def __init__( - self, - optimizer: torch.optim.Optimizer, - warmup_epochs: int, - max_epochs: int, - last_epoch: int = -1, - start_value: float = 1.0, - end_value: float = 0.001, - period: Optional[int] = None, - verbose: bool = False, - ) -> None: - self.warmup_epochs = warmup_epochs - self.max_epochs = max_epochs - self.start_value = start_value - self.end_value = end_value - self.period = period - super().__init__( - optimizer=optimizer, - lr_lambda=self.scale_lr, - last_epoch=last_epoch, - verbose=verbose, - ) - - def scale_lr(self, epoch: int) -> float: - """ - Scale learning rate according to the current epoch number. - - Args: - epoch: - Current epoch number. - - Returns: - Scaled learning rate. - - """ - if epoch < self.warmup_epochs: - return self.start_value * (epoch + 1) / self.warmup_epochs - elif self.period is not None: - return cosine_schedule( - step=epoch - self.warmup_epochs, - max_steps=1, - start_value=self.start_value, - end_value=self.end_value, - period=self.period, - ) - else: - return cosine_schedule( - step=epoch - self.warmup_epochs, - max_steps=self.max_epochs - self.warmup_epochs, - start_value=self.start_value, - end_value=self.end_value, - ) diff --git a/zoobot/pytorch/training/schedulers.py b/zoobot/pytorch/training/schedulers.py new file mode 100644 index 00000000..aa67775b --- /dev/null +++ b/zoobot/pytorch/training/schedulers.py @@ -0,0 +1,77 @@ + +class CosineWarmupScheduler(torch.optim.lr_scheduler.LambdaLR): + """Cosine warmup scheduler for learning rate. + + Args: + optimizer: + Optimizer object to schedule the learning rate. + warmup_epochs: + Number of warmup epochs or steps. 
+ max_epochs: + Total number of training epochs or steps. + last_epoch: + The index of last epoch or step. Default: -1 + start_value: + Starting learning rate scale. Default: 1.0 + end_value: + Target learning rate scale. Default: 0.001 + verbose: + If True, prints a message to stdout for each update. Default: False. + + Note: The `epoch` arguments do not necessarily have to be epochs. Any step or index + can be used. The naming follows the Pytorch convention to use `epoch` for the steps + in the scheduler. + """ + + def __init__( + self, + optimizer: torch.optim.Optimizer, + warmup_epochs: int, + max_epochs: int, + last_epoch: int = -1, + start_value: float = 1.0, + end_value: float = 0.001, + period: Optional[int] = None, + verbose: bool = False, + ) -> None: + self.warmup_epochs = warmup_epochs + self.max_epochs = max_epochs + self.start_value = start_value + self.end_value = end_value + self.period = period + super().__init__( + optimizer=optimizer, + lr_lambda=self.scale_lr, + last_epoch=last_epoch, + verbose=verbose, + ) + + def scale_lr(self, epoch: int) -> float: + """ + Scale learning rate according to the current epoch number. + + Args: + epoch: + Current epoch number. + + Returns: + Scaled learning rate. + + """ + if epoch < self.warmup_epochs: + return self.start_value * (epoch + 1) / self.warmup_epochs + elif self.period is not None: + return cosine_schedule( + step=epoch - self.warmup_epochs, + max_steps=1, + start_value=self.start_value, + end_value=self.end_value, + period=self.period, + ) + else: + return cosine_schedule( + step=epoch - self.warmup_epochs, + max_steps=self.max_epochs - self.warmup_epochs, + start_value=self.start_value, + end_value=self.end_value, + ) From 7eb780aca270a4a8679f320c9186ca503f33beb3 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 16:16:21 -0400 Subject: [PATCH 295/307] imports --- zoobot/pytorch/training/finetune.py | 67 ------------------------- zoobot/pytorch/training/schedulers.py | 70 +++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 67 deletions(-) diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 825a4235..76f73990 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -870,70 +870,3 @@ def download_from_name(class_name: str, hub_name: str): filename=f"{class_name}.ckpt" ) return downloaded_loc - - - - -def cosine_schedule( - step: int, - max_steps: int, - start_value: float, - end_value: float, - period: Optional[int] = None, -) -> float: - """ - Use cosine decay to gradually modify start_value to reach target end_value during - iterations. - Copied from lightly library (thank you for open sourcing) - - Args: - step: - Current step number. - max_steps: - Total number of steps. - start_value: - Starting value. - end_value: - Target value. - period (optional): - The number of steps over which the cosine function completes a full cycle. - If not provided, it defaults to max_steps. - - Returns: - Cosine decay value. 
- - """ - if step < 0: - raise ValueError("Current step number can't be negative") - if max_steps < 1: - raise ValueError("Total step number must be >= 1") - if period is None and step > max_steps: - warnings.warn( - f"Current step number {step} exceeds max_steps {max_steps}.", - category=RuntimeWarning, - ) - if period is not None and period <= 0: - raise ValueError("Period must be >= 1") - - decay: float - if period is not None: # "cycle" based on period, if provided - decay = ( - end_value - - (end_value - start_value) * (np.cos(2 * np.pi * step / period) + 1) / 2 - ) - elif max_steps == 1: - # Avoid division by zero - decay = end_value - elif step == max_steps: - # Special case for Pytorch Lightning which updates LR scheduler also for epoch - # after last training epoch. - decay = end_value - else: - decay = ( - end_value - - (end_value - start_value) - * (np.cos(np.pi * step / (max_steps - 1)) + 1) - / 2 - ) - return decay - diff --git a/zoobot/pytorch/training/schedulers.py b/zoobot/pytorch/training/schedulers.py index aa67775b..c93f844f 100644 --- a/zoobot/pytorch/training/schedulers.py +++ b/zoobot/pytorch/training/schedulers.py @@ -1,3 +1,73 @@ +import warnings + +import torch +import numpy as np +from typing import Optional + + +def cosine_schedule( + step: int, + max_steps: int, + start_value: float, + end_value: float, + period: Optional[int] = None, +) -> float: + """ + Use cosine decay to gradually modify start_value to reach target end_value during + iterations. + Copied from lightly library (thank you for open sourcing) + + Args: + step: + Current step number. + max_steps: + Total number of steps. + start_value: + Starting value. + end_value: + Target value. + period (optional): + The number of steps over which the cosine function completes a full cycle. + If not provided, it defaults to max_steps. + + Returns: + Cosine decay value. + + """ + if step < 0: + raise ValueError("Current step number can't be negative") + if max_steps < 1: + raise ValueError("Total step number must be >= 1") + if period is None and step > max_steps: + warnings.warn( + f"Current step number {step} exceeds max_steps {max_steps}.", + category=RuntimeWarning, + ) + if period is not None and period <= 0: + raise ValueError("Period must be >= 1") + + decay: float + if period is not None: # "cycle" based on period, if provided + decay = ( + end_value + - (end_value - start_value) * (np.cos(2 * np.pi * step / period) + 1) / 2 + ) + elif max_steps == 1: + # Avoid division by zero + decay = end_value + elif step == max_steps: + # Special case for Pytorch Lightning which updates LR scheduler also for epoch + # after last training epoch. + decay = end_value + else: + decay = ( + end_value + - (end_value - start_value) + * (np.cos(np.pi * step / (max_steps - 1)) + 1) + / 2 + ) + return decay + class CosineWarmupScheduler(torch.optim.lr_scheduler.LambdaLR): """Cosine warmup scheduler for learning rate. 
From 6657c559ce3000ddb28de0894ae1bd269a088e6f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 16:26:21 -0400 Subject: [PATCH 296/307] typo --- zoobot/pytorch/training/train_with_pytorch_lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 344e81d2..34b3c50d 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -255,7 +255,7 @@ def train_default_zoobot_from_scratch( prefetch_factor=prefetch_factor, cache_dir=cache_dir, # augmentation args - color=color, + greyscale=not color, crop_scale_bounds=crop_scale_bounds, crop_ratio_bounds=crop_ratio_bounds, resize_after_crop=resize_after_crop, From cda966f30c9dcad058492ca63890856a5a628e57 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Sun, 31 Mar 2024 16:43:39 -0400 Subject: [PATCH 297/307] check tasks per node --- zoobot/pytorch/manchester.py | 1 + zoobot/pytorch/training/train_with_pytorch_lightning.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/zoobot/pytorch/manchester.py b/zoobot/pytorch/manchester.py index f1f966ad..dfab82fd 100644 --- a/zoobot/pytorch/manchester.py +++ b/zoobot/pytorch/manchester.py @@ -35,6 +35,7 @@ class GalahadEnvironment(SLURMEnvironment): def __init__(self, **kwargs): ntasks_per_node = os.environ["SLURM_TASKS_PER_NODE"].split("(")[0] os.environ["SLURM_NTASKS_PER_NODE"] = ntasks_per_node + logging.warning(f'Within custom slurm environment, --n-tasks-per-node={ntasks_per_node}') # os.environ["SLURM_NTASKS"] = str(os.environ["SLURM_NTASKS_PER_NODE"]) super().__init__(**kwargs) self.nnodes = int(os.environ["SLURM_NNODES"]) diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 34b3c50d..9e22b508 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -144,7 +144,7 @@ def train_default_zoobot_from_scratch( if 'SLURM_NTASKS_PER_NODE' not in os.environ.keys(): os.environ['SLURM_NTASKS_PER_NODE'] = os.environ['SLURM_TASKS_PER_NODE'] from zoobot.pytorch import manchester - logging.warning('Using custom slurm environment') + logging.warning(f'Using custom slurm environment, --n-tasks-per-node={os.environ["SLURM_NTASKS_PER_NODE"]}') # https://pytorch-lightning.readthedocs.io/en/stable/clouds/cluster_advanced.html#enable-auto-wall-time-resubmitions plugins = [manchester.GalahadEnvironment(auto_requeue=False)] From 1fa4c0874b66a6925a9f886a181cd0bda4d18d98 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Tue, 2 Apr 2024 20:41:27 -0400 Subject: [PATCH 298/307] docs pass --- README.md | 6 +- docs/autodoc/api.rst | 15 ---- docs/autodoc/pytorch/training/finetune.rst | 18 ++++- docs/autodoc/shared/schemas.rst | 1 - docs/autodoc/tensorflow.rst | 27 ------- .../tensorflow/estimators/define_model.rst | 17 ---- .../estimators/efficientnet_custom.rst | 9 --- .../predictions/predict_on_dataset.rst | 10 --- docs/autodoc/tensorflow/training/finetune.rst | 11 --- docs/autodoc/tensorflow/training/losses.rst | 19 ----- .../tensorflow/training/train_with_keras.rst | 6 -- .../tensorflow/training/training_config.rst | 12 --- docs/guides/advanced_finetuning.rst | 62 +++++++-------- docs/guides/finetuning.rst | 2 +- docs/guides/guides.rst | 8 +- docs/guides/how_the_code_fits_together.rst | 78 +++++++++---------- 
docs/guides/loading_data.rst | 52 +++++++++++++ docs/guides/pytorch_or_tensorflow.rst | 40 ---------- docs/index.rst | 34 ++++---- zoobot/pytorch/training/finetune.py | 40 +++++----- .../training/train_with_pytorch_lightning.py | 22 ++++-- zoobot/shared/schemas.py | 18 +++++ 22 files changed, 210 insertions(+), 297 deletions(-) delete mode 100755 docs/autodoc/api.rst delete mode 100644 docs/autodoc/tensorflow.rst delete mode 100755 docs/autodoc/tensorflow/estimators/define_model.rst delete mode 100755 docs/autodoc/tensorflow/estimators/efficientnet_custom.rst delete mode 100755 docs/autodoc/tensorflow/predictions/predict_on_dataset.rst delete mode 100644 docs/autodoc/tensorflow/training/finetune.rst delete mode 100755 docs/autodoc/tensorflow/training/losses.rst delete mode 100644 docs/autodoc/tensorflow/training/train_with_keras.rst delete mode 100755 docs/autodoc/tensorflow/training/training_config.rst create mode 100644 docs/guides/loading_data.rst delete mode 100644 docs/guides/pytorch_or_tensorflow.rst diff --git a/README.md b/README.md index f74e7e1b..f919f978 100755 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Zoobot is trained using millions of answers by Galaxy Zoo volunteers. This code ## Installation -You can retrain Zoobot in the cloud with a free GPU using this [Google Colab notebook](https://colab.research.google.com/drive/17bb_KbA2J6yrIm4p4Ue_lEBHMNC1I9Jd?usp=sharing). To install locally, keep reading. +You can retrain Zoobot in the cloud with a free GPU using this [Google Colab notebook](https://colab.research.google.com/drive/1A_-M3Sz5maQmyfW2A7rEu-g_Zi0RMGz5?usp=sharing). To install locally, keep reading. Download the code using git: @@ -49,7 +49,7 @@ I share my install steps [here](#install_cuda). GPUs are optional - Zoobot will ## Quickstart -The [Colab notebook](https://colab.research.google.com/drive/17bb_KbA2J6yrIm4p4Ue_lEBHMNC1I9Jd?usp=sharing) is the quickest way to get started. Alternatively, the minimal example below illustrates how Zoobot works. +The [Colab notebook](https://colab.research.google.com/drive/1A_-M3Sz5maQmyfW2A7rEu-g_Zi0RMGz5?usp=sharing) is the quickest way to get started. Alternatively, the minimal example below illustrates how Zoobot works. Let's say you want to find ringed galaxies and you have a small labelled dataset of 500 ringed or not-ringed galaxies. You can retrain Zoobot to find rings like so: @@ -97,7 +97,7 @@ Zoobot includes many guides and working examples - see the [Getting Started](#ge ## Getting Started -I suggest starting with the [Colab notebook](https://colab.research.google.com/drive/17bb_KbA2J6yrIm4p4Ue_lEBHMNC1I9Jd?usp=sharing) or the worked examples below, which you can copy and adapt. +I suggest starting with the [Colab notebook](https://colab.research.google.com/drive/1A_-M3Sz5maQmyfW2A7rEu-g_Zi0RMGz5?usp=sharing) or the worked examples below, which you can copy and adapt. For context and explanation, see the [documentation](https://zoobot.readthedocs.io/). diff --git a/docs/autodoc/api.rst b/docs/autodoc/api.rst deleted file mode 100755 index e60ef207..00000000 --- a/docs/autodoc/api.rst +++ /dev/null @@ -1,15 +0,0 @@ - -API -==== - -We encourage you to explore the code directly. -There are many comments (and commented-out examples) which might be helpful. -However, for convenience, you can check the docstrings directly here. - - -.. 
toctree:: - :maxdepth: 2 - - pytorch - tensorflow - shared diff --git a/docs/autodoc/pytorch/training/finetune.rst b/docs/autodoc/pytorch/training/finetune.rst index a23e767c..bd8a261c 100644 --- a/docs/autodoc/pytorch/training/finetune.rst +++ b/docs/autodoc/pytorch/training/finetune.rst @@ -7,6 +7,7 @@ See the `README `_ for a minimal example. See zoobot/pytorch/examples for more worked examples. .. autoclass:: zoobot.pytorch.training.finetune.FinetuneableZoobotAbstract + :members: configure_optimizers | @@ -14,12 +15,27 @@ See zoobot/pytorch/examples for more worked examples. | +.. autoclass:: zoobot.pytorch.training.finetune.FinetuneableZoobotRegressor + +| + .. autoclass:: zoobot.pytorch.training.finetune.FinetuneableZoobotTree | +.. autoclass:: zoobot.pytorch.training.finetune.LinearHead + :members: forward + +| + +.. autofunction:: zoobot.pytorch.training.finetune.load_pretrained_zoobot + +| + .. autofunction:: zoobot.pytorch.training.finetune.get_trainer | -.. autofunction:: zoobot.pytorch.training.finetune.load_pretrained_encoder +.. autofunction:: zoobot.pytorch.training.finetune.download_from_name + +| \ No newline at end of file diff --git a/docs/autodoc/shared/schemas.rst b/docs/autodoc/shared/schemas.rst index 7df8e0a9..afafe8a1 100755 --- a/docs/autodoc/shared/schemas.rst +++ b/docs/autodoc/shared/schemas.rst @@ -26,6 +26,5 @@ See :ref:`training_on_vote_counts` for full explanation. | .. autoclass:: zoobot.shared.schemas.Schema - :members: | \ No newline at end of file diff --git a/docs/autodoc/tensorflow.rst b/docs/autodoc/tensorflow.rst deleted file mode 100644 index c36b0943..00000000 --- a/docs/autodoc/tensorflow.rst +++ /dev/null @@ -1,27 +0,0 @@ -tensorflow -============= - -estimators -------------- - -.. toctree:: - - tensorflow/estimators/define_model - tensorflow/estimators/efficientnet_custom - -training -------------- - -.. toctree:: - - tensorflow/training/finetune - tensorflow/training/train_with_keras - tensorflow/training/training_config - tensorflow/training/losses - -predictions -------------- - -.. toctree:: - - tensorflow/predictions/predict_on_dataset diff --git a/docs/autodoc/tensorflow/estimators/define_model.rst b/docs/autodoc/tensorflow/estimators/define_model.rst deleted file mode 100755 index 3bbe02ed..00000000 --- a/docs/autodoc/tensorflow/estimators/define_model.rst +++ /dev/null @@ -1,17 +0,0 @@ -define_model -=================== - -This module contains functions for defining an EfficientNet model (:meth:`zoobot.estimators.define_model.get_model`), -with or without the GZ DECaLS head, and optionally to load the weights of a pretrained model. - -Models are defined using functions in ``efficientnet_standard`` and ``efficientnet_custom``. - -.. autofunction:: zoobot.tensorflow.estimators.define_model.get_model - -| - -.. autofunction:: zoobot.tensorflow.estimators.define_model.load_weights - -| - -.. autofunction:: zoobot.tensorflow.estimators.define_model.load_model diff --git a/docs/autodoc/tensorflow/estimators/efficientnet_custom.rst b/docs/autodoc/tensorflow/estimators/efficientnet_custom.rst deleted file mode 100755 index ba656134..00000000 --- a/docs/autodoc/tensorflow/estimators/efficientnet_custom.rst +++ /dev/null @@ -1,9 +0,0 @@ -efficientnet_custom -=================== - -.. autofunction:: zoobot.tensorflow.estimators.efficientnet_custom.define_headless_efficientnet - -| - -.. 
autofunction:: zoobot.tensorflow.estimators.efficientnet_custom.custom_top_dirichlet - diff --git a/docs/autodoc/tensorflow/predictions/predict_on_dataset.rst b/docs/autodoc/tensorflow/predictions/predict_on_dataset.rst deleted file mode 100755 index eda2c76a..00000000 --- a/docs/autodoc/tensorflow/predictions/predict_on_dataset.rst +++ /dev/null @@ -1,10 +0,0 @@ -predict_on_dataset -=================== - -This module includes utilities to make predictions with a trained model on a list of images. - -.. autofunction:: zoobot.tensorflow.predictions.predict_on_dataset.predict - -| - -.. autofunction:: zoobot.tensorflow.predictions.predict_on_dataset.paths_in_folder diff --git a/docs/autodoc/tensorflow/training/finetune.rst b/docs/autodoc/tensorflow/training/finetune.rst deleted file mode 100644 index 6d0ceee3..00000000 --- a/docs/autodoc/tensorflow/training/finetune.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _tensorflow_finetune: - -finetune -=================== - -Functions to load and adapt a trained (TensorFlow) Zoobot model to a new problem. - -:.. warning:: PyTorch is recommended for new users. See :ref:`pytorch_or_tensorflow` for more. - - -.. autofunction:: zoobot.tensorflow.training.finetune.run_finetuning diff --git a/docs/autodoc/tensorflow/training/losses.rst b/docs/autodoc/tensorflow/training/losses.rst deleted file mode 100755 index e744c44c..00000000 --- a/docs/autodoc/tensorflow/training/losses.rst +++ /dev/null @@ -1,19 +0,0 @@ -losses -=================== - -This module contains functions for calculating the custom Dirichlet-Multinomial loss used for Galaxy Zoo decision trees. - - -.. autofunction:: zoobot.tensorflow.training.losses.get_multiquestion_loss - -| - -.. autofunction:: zoobot.tensorflow.training.losses.calculate_multiquestion_loss - -| - -.. autofunction:: zoobot.tensorflow.training.losses.dirichlet_loss - -| - -.. autofunction:: zoobot.tensorflow.training.losses.get_dirichlet_neg_log_prob diff --git a/docs/autodoc/tensorflow/training/train_with_keras.rst b/docs/autodoc/tensorflow/training/train_with_keras.rst deleted file mode 100644 index 2c7026b2..00000000 --- a/docs/autodoc/tensorflow/training/train_with_keras.rst +++ /dev/null @@ -1,6 +0,0 @@ -train_with_keras -=================== - -This is the interface to train new tensorflow models from scratch. - -.. autofunction:: zoobot.tensorflow.training.train_with_keras.train diff --git a/docs/autodoc/tensorflow/training/training_config.rst b/docs/autodoc/tensorflow/training/training_config.rst deleted file mode 100755 index e12d4b69..00000000 --- a/docs/autodoc/tensorflow/training/training_config.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _training_config: - -training_config -=================== - -This module creates the :class:`Trainer` class for training a Zoobot model (itself a tf.keras.Model). -Implements common features training like early stopping and tensorboard logging. - -Follows the same idea as the PyTorch Lightning Trainer object. - -.. 
autoclass:: zoobot.tensorflow.training.training_config.Trainer - :members: diff --git a/docs/guides/advanced_finetuning.rst b/docs/guides/advanced_finetuning.rst index 59a59aff..6554f69c 100644 --- a/docs/guides/advanced_finetuning.rst +++ b/docs/guides/advanced_finetuning.rst @@ -4,48 +4,48 @@ Advanced Finetuning ===================== -Zoobot includes the :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotClassifier` and :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotTree` -classes to help you finetune Zoobot on classification or decision tree problems, respectively. -But what about other problems, like regression or object detection? +Zoobot includes the :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotClassifier`, :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotRegressor`, and :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotTree` +classes to help you finetune Zoobot on classification, regression, or decision tree problems, respectively. +But what about other problems, like object detection? Here's how to integrate pretrained Zoobot models into your own code. Using Zoobot's Encoder Directly ------------------------------------ -To get Zoobot's encoder, load the model and access the .encoder attribute: +To get Zoobot's encoder, load any Finetuneable class and grab the encoder attribute: .. code-block:: python - model = ZoobotTree.load_from_checkpoint(pretrained_checkpoint_loc) + model = FinetuneableZoobotClassifier(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano') encoder = model.encoder - model = FinetuneableZoobotClassifier.load_from_checkpoint(finetuned_checkpoint_loc) - encoder = model.encoder +or, because Zoobot encoders are `timm` models, you can just directly use `timm`: + +.. code-block:: python + + import timm + + encoder = timm.create_model('hf_hub:mwalmsley/zoobot-encoder-convnext_nano', pretrained=True, num_classes=0) - # for ZoobotTree, there's also a utility function to do this in one line - encoder = finetune.load_pretrained_encoder(pretrained_checkpoint_loc) -:class:`zoobot.pytorch.estimators.define_model.ZoobotTree`, :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotClassifier` and :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotTree` -all have ``.encoder`` and ``.head`` attributes. These are the plain PyTorch (Sequential) models used for encoding or task predictions. -The Zoobot classes simply wrap these with instructions for training, logging, checkpointing, and so on. +You can use it like any other `timm` model. For example, we did this to `add contrastive learning `_. Good luck! -You can use the encoder separately like any PyTorch Sequential for any machine learning task. We did this to `add contrastive learning `_. Go nuts. Subclassing FinetuneableZoobotAbstract --------------------------------------- -If you'd like to finetune Zoobot on a new task that isn't classification or vote counts, +If you'd like to finetune Zoobot on a new task that isn't classification, regression, or vote counts, you could instead subclass :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotAbstract`. -This is less general but avoids having to write out your own finetuning training code in e.g. PyTorch Lightning. +This lets you use our finetuning code with your own head and loss functions. -For example, to make a regression version: +Imagine there wasn't a regression version and you wanted to finetune Zoobot on a regression task. You could do: .. 
code-block:: python - class FinetuneableZoobotRegression(FinetuneableZoobotAbstract): + class FinetuneableZoobotCustomRegression(FinetuneableZoobotAbstract): def __init__( self, @@ -56,12 +56,12 @@ For example, to make a regression version: super().__init__(**super_kwargs) self.foo = foo - self.loss = torch.nn.MSELoss() - self.head = torch.nn.Sequential(...) + self.loss = torch.nn.SomeCrazyLoss() + self.head = torch.nn.Sequential(my_crazy_head) # see zoobot/pytorch/training/finetune.py for more examples and all methods required -You can then finetune this new class just as with e.g. FinetuneableZoobotClassifier. +You can then finetune this new class just as with e.g. :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotRegressor`. Extracting Frozen Representations @@ -71,27 +71,21 @@ Once you've finetuned to your survey, or if you're using a pretrained survey, (S the representations can be stored as frozen vectors and used as features. We use this at Galaxy Zoo to power our upcoming similary search and anomaly-finding tools. -As above, we can get Zoobot's encoder from the .encoder attribute: - -.. code-block:: python - - # can load from either ZoobotTree (if trained from scratch) - # or FinetuneableZoobotTree (if finetuned) - encoder = finetune.FinetuneableZoobotTree.load_from_checkpoint(checkpoint_loc).encoder - -``encoder`` is a PyTorch Sequential object, so we could use ``encoder.predict()`` to calculate our representations. +As above, we can get Zoobot's encoder from the .encoder attribute. We could use ``encoder()`` to calculate our representations. But then we'd have to deal with batching, looping, etc. To avoid this boilerplate, Zoobot includes a PyTorch Lightning class that lets you pass ``encoder`` to the same :func:`zoobot.pytorch.predictions.predict_on_catalog.predict` utility function used for making predictions with a full Zoobot model. .. code-block:: python + from zoobot.pytorch.training import representations + # convert to simple pytorch lightning model - model = representations.ZoobotEncoder(encoder=encoder, pyramid=False) + lightning_encoder = ZoobotEncoder.load_from_name('hf_hub:mwalmsley/zoobot-encoder-convnext_nano') predict_on_catalog.predict( catalog, - model, + lightning_encoder, n_samples=1, label_cols=label_cols, save_loc=save_loc, @@ -101,9 +95,9 @@ utility function used for making predictions with a full Zoobot model. See `zoobot/pytorch/examples/representations `_ for a full working example. -We plan on adding precalculated representations for all our DESI galaxies - but we haven't done it yet. Sorry. -Please raise an issue if you really need these. +We are sharing precalculated representations for all our DESI galaxies, and soon for HSC as well. +Check the data notes at :doc:/data_notes -The representations are typically quite high-dimensional (1280 for EfficientNetB0) and therefore highly redundant. +The representations are typically quite high-dimensional (e.g. 1280 for EfficientNetB0) and therefore highly redundant. We suggest using PCA to compress them down to a more reasonable dimension (e.g. 40) while preserving most of the information. This was our approach in the `Practical Morphology Tools paper `_. 
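As a concrete sketch of that PCA step (illustrative only: it assumes the representations have already been saved to a numpy array, and the filename below is a placeholder rather than a file Zoobot itself produces):

.. code-block:: python

    import numpy as np
    from sklearn.decomposition import PCA  # scikit-learn

    # one row per galaxy, one column per representation dimension (e.g. 1280 for EfficientNetB0)
    features = np.load('representations.npy')  # placeholder path, not produced by Zoobot

    # compress to ~40 dimensions while keeping most of the information
    pca = PCA(n_components=40)
    compressed = pca.fit_transform(features)
    print(compressed.shape)  # (n_galaxies, 40)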
diff --git a/docs/guides/finetuning.rst b/docs/guides/finetuning.rst index 1ab59003..d46eee1b 100755 --- a/docs/guides/finetuning.rst +++ b/docs/guides/finetuning.rst @@ -30,7 +30,7 @@ Examples Zoobot includes many working examples of finetuning: -- `Google Colab notebook `__ (for binary classification in the cloud) +- `Google Colab notebook `__ (for binary classification in the cloud) - `finetune_binary_classification.py `__ (script version of the Colab notebook) - `finetune_counts_full_tree.py `__ (for finetuning on a complicated GZ-style decision tree) diff --git a/docs/guides/guides.rst b/docs/guides/guides.rst index e5ab3399..1de9e932 100755 --- a/docs/guides/guides.rst +++ b/docs/guides/guides.rst @@ -9,10 +9,8 @@ Below are some practical guides for tasks that we hope Zoobot will be helpful fo /guides/finetuning /guides/advanced_finetuning - /guides/training_on_vote_counts /guides/how_the_code_fits_together - /guides/pytorch_or_tensorflow - -If you'd prefer worked examples, you can find those under `zoobot/pytorch/examples `_ and `zoobot/tensorflow/examples `_. + /guides/loading_data + /guides/training_on_vote_counts -There's also this `Colab notebook `_ demonstrating finetuning which you can run in the cloud (with free access to a powerful GPU, courtesy of Google Research) +If you'd prefer worked examples, you can find those under `zoobot/pytorch/examples `_. diff --git a/docs/guides/how_the_code_fits_together.rst b/docs/guides/how_the_code_fits_together.rst index 6bb8109e..9c816ad5 100644 --- a/docs/guides/how_the_code_fits_together.rst +++ b/docs/guides/how_the_code_fits_together.rst @@ -6,37 +6,46 @@ How the Code Fits Together The Zoobot package has many classes and methods. This guide aims to be a map summarising how they fit together. -.. note:: For simplicity, we will only consider the PyTorch version (see :ref:`pytorch_or_tensorflow`). - -Defining PyTorch Models +The Map ------------------------- -The deep learning part is the simplest piece. -``define_model.py`` has functions to that define pure PyTorch ``nn.Modules`` (a.k.a. models). +The Zoobot package has two roles: +1. **Finetuning**: ``pytorch/training/finetune.py`` is the heart of the package. You will use these classes to load pretrained models and finetune them on new data. +2. **Training from Scratch** ``pytorch/estimators/define_model.py`` and ``pytorch/training/train_with_pytorch_lightning.py`` create and train the Zoobot models from scratch. These are *not required* for finetuning and will eventually be migrated out. -Encoders (a.k.a. models that take an image and compress it to a representation vector) are defined using the third party library ``timm``. -Specifically, ``timm.create_model(architecture_name)`` is used to get the EfficientNet, ResNet, ViT, etc. architectures used to encode our galaxy images. -This is helpful because defining complicated architectures becomes someone else's job (thanks, Ross Wightman!) +Let's zoom in on the finetuning part. -Heads (a.k.a. models that take a representation vector and make a prediction) are defined using ``torch.nn.Sequential``. -The function :func:`zoobot.pytorch.estimators.define_model.get_pytorch_dirichlet_head`, for example, returns the custom head used to predict vote counts (see :ref:`training_on_vote_counts`). 
+Finetuning with Zoobot Classes +-------------------------------- -The encoders and heads in ``define_model.py`` are used for both training from scratch and finetuning -Training with PyTorch Lightning --------------------------------- +There are three Zoobot classes for finetuning: +1. :class:`FinetuneableZoobotClassifier ` for classification tasks (including multi-class). +2. :class:`FinetuneableZoobotRegressor ` for regression tasks (including on a unit interval e.g. a fraction). +3. :class:`FinetuneableZoobotTree ` for training on a tree of labels (e.g. Galaxy Zoo vote counts). + +Each user-facing class is actually a subclass of a non-user-facing abstract class, :class:`FinetuneableZoobotAbstract `. +:class:`FinetuneableZoobotAbstract ` has specifying how to finetune a general PyTorch model, +which the user-facing classes inherit. + +`FinetuneableZoobotAbstract ` controls the core finetuning process: loading a model, accepting arguments controlling the finetuning process, and running the finetuning. +The user-facing class adds features specific to that type of task. For example, :class:`FinetuneableZoobotClassifier ` adds additional arguments like `num_classes`. +It also specifies an appropriate head and a loss function. + + + +Finetuning with PyTorch Lightning +----------------------------------- -PyTorch requires a lot of boilerplate code to train models, especially at scale (e.g. multi-node, multi-GPU). -We use PyTorch Lightning, a third party wrapper API, to make this boilerplate code someone else's job as well. -The core Zoobot classes you'll use - :class:`ZoobotTree `, :class:`FinetuneableZoobotClassifier ` and :class:`FinetuneableZoobotTree ` - are all "LightningModule" classes. These classes have (custom) methods like ``training_step``, ``validation_step``, etc., which specify what should happen at each training stage. -:class:`FinetuneableZoobotClassifier ` and :class:`FinetuneableZoobotTree ` -are actually subclasses of a non-user-facing abstract class, :class:`FinetuneableZoobotAbstract `. -:class:`FinetuneableZoobotAbstract ` has specifying how to finetune a general PyTorch model, -which `FinetuneableZoobotClassifier ` and :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotTree` inherit. + +Zoobot is written in PyTorch, a popular deep learning library for Python. +PyTorch requires a lot of boilerplate code to train models, especially at scale (e.g. multi-node, multi-GPU). +We use PyTorch Lightning, a third party wrapper API, to make this boilerplate code someone else's job. + :class:`ZoobotTree ` is similar to :class:`FinetuneableZoobotAbstract ` but has methods for training from scratch. @@ -66,28 +75,17 @@ Slightly confusingly, Lightning's ``Trainer`` can also be used to make predictio and that's how we make predictions with :func:`zoobot.pytorch.predictions.predict_on_catalog.predict`. -Loading Data --------------------------- - -You might notice ``datamodule`` in the examples above. -Zoobot often includes code like: - -.. code-block:: python +As you can see, there's quite a few layers (pun intended) to training Zoobot models. But we hope this setup is both simple to use and easy to extend, whichever (PyTorch) frameworks you're using. - from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule - datamodule = GalaxyDataModule( - train_catalog=train_catalog, - val_catalog=val_catalog, - test_catalog=test_catalog, - batch_size=batch_size, - # ... - ) +.. The deep learning part is the simplest piece. +.. 
``define_model.py`` has functions to that define pure PyTorch ``nn.Modules`` (a.k.a. models). -Note the import - Zoobot actually doesn't have any code for loading data! -That's in the separate repository `mwalmsley/galaxy-datasets `. +.. Encoders (a.k.a. models that take an image and compress it to a representation vector) are defined using the third party library ``timm``. +.. Specifically, ``timm.create_model(architecture_name)`` is used to get the EfficientNet, ResNet, ViT, etc. architectures used to encode our galaxy images. +.. This is helpful because defining complicated architectures becomes someone else's job (thanks, Ross Wightman!) -``galaxy-datasets`` has custom code to turn catalogs of galaxies into the ``LightningDataModule``s that Lightning `expects https://pytorch-lightning.readthedocs.io/en/stable/data/datamodule.html<>`_. -These ``LightningDataModule``s themselves have attributes like ``.train_dataloader()`` and ``.predict_dataloader()`` that Lightning's ``Trainer`` object uses to demand data when training, making predictions, and so forth. +.. Heads (a.k.a. models that take a representation vector and make a prediction) are defined using ``torch.nn.Sequential``. +.. The function :func:`zoobot.pytorch.estimators.define_model.get_pytorch_dirichlet_head`, for example, returns the custom head used to predict vote counts (see :ref:`training_on_vote_counts`). -As you can see, there's quite a few layers (pun intended) to training Zoobot models. But we hope this setup is both simple to use and easy to extend, whichever (PyTorch) frameworks you're using. +.. The encoders and heads in ``define_model.py`` are used for both training from scratch and finetuning diff --git a/docs/guides/loading_data.rst b/docs/guides/loading_data.rst new file mode 100644 index 00000000..c6c74857 --- /dev/null +++ b/docs/guides/loading_data.rst @@ -0,0 +1,52 @@ + +Loading Data +-------------------------- + +Using GalaxyDataModule +========================= + +Zoobot often includes code like: + +.. code-block:: python + + from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule + + datamodule = GalaxyDataModule( + train_catalog=train_catalog, + val_catalog=val_catalog, + test_catalog=test_catalog, + batch_size=batch_size, + label_cols=['is_cool_galaxy'] + # ... + ) + +Note the import - Zoobot actually doesn't have any code for loading data! +That's in the separate repository `mwalmsley/galaxy-datasets `_. + +``galaxy-datasets`` has custom code to turn catalogs of galaxies into the ``LightningDataModule`` that Lightning `expects `_. +Each ``LightningDataModule`` has attributes like ``.train_dataloader()`` and ``.predict_dataloader()`` that Lightning's ``Trainer`` object uses to demand data when training, making predictions, and so forth. + +You can pass ``GalaxyDataModule`` train, val, test and predict catalogs. Each catalog needs the columns: + +* ``file_loc``: the path to the image file +* ``id_str``: a unique identifier for the galaxy +* plus any columns for labels, which you will specify with ``label_cols``. Setting ``label_cols=None`` will load the data without labels (returning batches of (image, id_str)). + +``GalaxyDataModule`` will load the images from disk and apply any transformations you specify. Specify transforms one of three ways: + +* through the `default arguments `_ of ``GalaxyDataModule`` (e.g. ``GalaxyDataModule(resize_after_crop=(128, 128))``) +* through a torchvision or albumentations ``Compose`` object e.g. 
``GalaxyDataModule(custom_torchvision_transforms=Compose([RandomHorizontalFlip(), RandomVerticalFlip()]))`` +* through a tuple of ``Compose`` objects. The first element will be used for the train dataloaders, and the second for the other dataloaders. + +Using the default arguments is simplest and should work well for loading Galaxy-Zoo-like ``jpg`` images. Passing Compose objects offers full customization (short of writing your own ``LightningDataModule``). On that note... + +I Want To Do It Myself +======================== + +Using ``galaxy-datasets`` is optional. Zoobot is designed to work with any PyTorch ``LightningDataModule`` that returns batches of (images, labels). +And advanced users can pass data to Zoobot's encoder however they like (see :doc:`advanced_finetuning`). + +Images should be PyTorch tensors of shape (batch_size, channels, height, width). +Values should be floats normalized from 0 to 1 (though in practice, Zoobot can handle other ranges provided you use end-to-end finetuning). +If you are presenting flux values, you should apply a dynamic range rescaling like ``np.arcsinh`` before normalizing to [0, 1]. +Galaxies should appear large and centered in the image. diff --git a/docs/guides/pytorch_or_tensorflow.rst b/docs/guides/pytorch_or_tensorflow.rst deleted file mode 100644 index 9c5bb244..00000000 --- a/docs/guides/pytorch_or_tensorflow.rst +++ /dev/null @@ -1,40 +0,0 @@ -.. _pytorch_or_tensorflow: - - - -PyTorch or TensorFlow? -=========================== - -.. warning:: You should use the PyTorch version if possible. This is being actively developed and has the latest features. - -Zoobot is really two separate sets of code: `zoobot/pytorch `_ and `zoobot/tensorflow `_. -They can both train the same EfficientNet model architecture on the same Galaxy Zoo data in the same way, for extracting representations and for finetuning - but they use different underlying deep learning frameworks to do so. - -We originally created two versions of Zoobot so that astronomers can use their preferred framework. -But maintaining two almost entirely separate sets of code is too much work for our current resources (Mike's time, basically). -Going forward, the PyTorch version will be actively developed and gain new features, while the TensorFlow version will be kept up-to-date but will not otherwise improve. - -Tell Me More About What's Different -------------------------------------- - -The TensorFlow version was the original version. -It was used for the `GZ DECaLS catalog `_ and the `Practical Morphology Tools `_ paper. -You can train EfficientNetB0 and achieve the same performance as with PyTorch (see the "benchmarks folder"). -You can also finetune the trained model, although the process is slightly clunkier. - -The PyTorch version was introduced to support other researchers and to integrate with Bootstrap Your Own Latent for the `Towards Foundation Models `_ paper. -This version is actively developed and includes the latest features. - -PyTorch-specific features include: - -- Any architecture option from timm (including ResNet and Max-ViT) -- Improved interface for easy finetuning -- Layerwise learning rate decay during finetuning -- Integration with AstroAugmentations (courtesy Micah Bowles) for custom astronomy image augmentations -- Per-question loss tracking on WandB - - -Can I have a JAX version? ----------------------------- - -Only if you build it yourself. 
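Returning to the tensor format described in the ``loading_data`` guide above, here is a minimal sketch of preparing a single flux image by hand (the random array is a stand-in for your own data; in practice you would batch many images, or let a ``LightningDataModule`` handle this):

.. code-block:: python

    import numpy as np
    import torch

    # stand-in for a single-channel flux cutout (e.g. read from a FITS file)
    flux = np.random.rand(224, 224).astype(np.float32)

    # dynamic range rescaling, then normalise to [0, 1]
    scaled = np.arcsinh(flux)
    scaled = (scaled - scaled.min()) / (scaled.max() - scaled.min() + 1e-12)

    # shape (batch_size, channels, height, width), float, values in [0, 1]
    image = torch.from_numpy(scaled).unsqueeze(0).unsqueeze(0)  # (1, 1, 224, 224)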
diff --git a/docs/index.rst b/docs/index.rst index a3a4cfc1..64fbcac4 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,7 @@ Zoobot Documentation ==================== Zoobot makes it easy to finetune a state-of-the-art deep learning classifier to solve your galaxy morphology problem. -For example, you can finetune a classifier to find ring galaxies with `just a few hundred examples `_. +For example, you can finetune a classifier to find ring galaxies with `just a few hundred examples `_. .. figure:: finetuning_rings.png :alt: Ring galaxies found using Zoobot @@ -15,14 +15,16 @@ For example, you can finetune a classifier to find ring galaxies with `just a fe The easiest way to learn to use Zoobot is simply to use Zoobot. We suggest you start with our worked examples. -The `Colab notebook `_ is the fastest way to get started. -See the README for many scripts that you can run and adapt locally. +* This `Colab notebook `_ will walk you through using Zoobot to classify galaxy images. +* There's a similar `notebook `_ for using Zoobot for regression on galaxy images. -Guides +For more explanation, read on. + +User Guides ------------- -If you'd like more explanation and context, we've written these guides. +We've written these guides to add explanation and context. .. toctree:: :maxdepth: 2 @@ -43,24 +45,17 @@ To choose and download a pretrained model, see here. API reference -------------- -Look here for information on a specific function, class or -method. +We've added docstrings to all the key methods you might use. Feel free to check the code or reach out if you have questions. .. toctree:: - :maxdepth: 2 - - autodoc/api - + :maxdepth: 4 -.. You do not need to be a machine learning expert to use Zoobot. -.. Zoobot includes :ref:`components ` for common tasks like loading images, managing training, and making predictions. -.. You simply need to assemble these together. - -.. .. toctree:: -.. :maxdepth: 2 - -.. components/overview + autodoc/pytorch +.. different level to not expand schema too much +.. toctree:: + :maxdepth: 3 + autodoc/shared .. Indices @@ -78,6 +73,7 @@ method. .. To build: .. install sphinx https://www.sphinx-doc.org/en/master/usage/installation.html is confusing, you can just use pip install -U sphinx +.. and pip install furo .. run from in docs folder: make html .. can also check docs with diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index 76f73990..c1e7402f 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -43,16 +43,16 @@ class FinetuneableZoobotAbstract(pl.LightningModule): - When provided `learning_rate` it will set the optimizer to use that learning rate. Any FinetuneableZoobot model can be loaded in one of three ways: - - HuggingFace name e.g. FinetuneableZoobotX(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. - - Any PyTorch model in memory e.g. FinetuneableZoobotX(encoder=some_model, ...) - - ZoobotTree checkpoint e.g. FinetuneableZoobotX(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) + - HuggingFace name e.g. `FinetuneableZoobotX(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...)`. Recommended. + - Any PyTorch model in memory e.g. `FinetuneableZoobotX(encoder=some_model, ...)` + - ZoobotTree checkpoint e.g. `FinetuneableZoobotX(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...)` You could subclass this class to solve new finetuning tasks - see :ref:`advanced_finetuning`. 
Args: name (str, optional): Name of a model on HuggingFace Hub e.g.'hf_hub:mwalmsley/zoobot-encoder-convnext_nano'. Defaults to None. encoder (torch.nn.Module, optional): A PyTorch model already loaded in memory - zoobot_checkpoint_loc (str, optional): Path to ZoobotTree lightning checkpoint to load. Loads with Load with :func:`zoobot.pytorch.training.finetune.load_pretrained_encoder`. Defaults to None. + zoobot_checkpoint_loc (str, optional): Path to ZoobotTree lightning checkpoint to load. Loads with Load with :func:`zoobot.pytorch.training.finetune.load_pretrained_zoobot`. Defaults to None. n_blocks (int, optional): lr_decay (float, optional): For each layer i below the head, reduce the learning rate by lr_decay ^ i. Defaults to 0.75. @@ -174,11 +174,11 @@ def configure_optimizers(self): and then pick the top self.n_blocks to finetune weight_decay is applied to both the head and (if relevant) the encoder - learning rate decay is applied to the encoder only: lr * (lr_decay**block_n), ignoring the head (block 0) + learning rate decay is applied to the encoder only: lr x (lr_decay^block_n), ignoring the head (block 0) What counts as a "block" is a bit fuzzy, but I generally use the self.encoder.stages from timm. I also count the stem as a block. - *batch norm layers may optionally still have updated statistics using always_train_batchnorm + batch norm layers may optionally still have updated statistics using always_train_batchnorm """ lr = self.learning_rate @@ -395,10 +395,8 @@ class FinetuneableZoobotClassifier(FinetuneableZoobotAbstract): These are shared between classifier, regressor, and tree models. See the docstring of :class:``FinetuneableZoobotAbstract`` for more. - Models can be loaded in one of three ways: - - HuggingFace name e.g. FinetuneableZoobotClassifier(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. - - Any PyTorch model in memory e.g. FinetuneableZoobotClassifier(encoder=some_model, ...) - - ZoobotTree checkpoint e.g. FinetuneableZoobotClassifier(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) + Models can be loaded with `FinetuneableZoobotClassifier(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...)`. + See :class:``FinetuneableZoobotAbstract`` for other loading options (e.g. in-memory models or local checkpoints). Args: num_classes (int): num. of target classes (e.g. 2 for binary classification). @@ -511,10 +509,8 @@ class FinetuneableZoobotRegressor(FinetuneableZoobotAbstract): These are shared between classifier, regressor, and tree models. See the docstring of :class:``FinetuneableZoobotAbstract`` for more. - Models can be loaded in one of three ways: - - HuggingFace name e.g. FinetuneableZoobotRegressor(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. - - Any PyTorch model in memory e.g. FinetuneableZoobotRegressor(encoder=some_model, ...) - - ZoobotTree checkpoint e.g. FinetuneableZoobotRegressor(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) + Models can be loaded with `FinetuneableZoobotRegressor(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...)`. + See :class:``FinetuneableZoobotAbstract`` for other loading options (e.g. in-memory models or local checkpoints). Args: @@ -619,10 +615,8 @@ class FinetuneableZoobotTree(FinetuneableZoobotAbstract): These are shared between classifier, regressor, and tree models. See the docstring of :class:``FinetuneableZoobotAbstract`` for more. - Models can be loaded in one of three ways: - - HuggingFace name e.g. 
FinetuneableZoobotRegressor(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...). Recommended. - - Any PyTorch model in memory e.g. FinetuneableZoobotRegressor(encoder=some_model, ...) - - ZoobotTree checkpoint e.g. FinetuneableZoobotRegressor(zoobot_checkpoint_loc='path/to/zoobot_tree.ckpt', ...) + Models can be loaded with `FinetuneableZoobotTree(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', ...)`. + See :class:``FinetuneableZoobotAbstract`` for other loading options (e.g. in-memory models or local checkpoints). Args: schema (schemas.Schema): description of the layout of the decision tree. See :class:`zoobot.shared.schemas.Schema`. @@ -680,7 +674,15 @@ def __init__(self, input_dim: int, output_dim: int, dropout_prob=0.5, activation self.activation = activation def forward(self, x): - # returns logits, as recommended for CrossEntropy loss + """returns logits, as recommended for CrossEntropy loss + + Args: + x (torch.Tensor): encoded representation + + Returns: + torch.Tensor: result (see docstring of LinearHead) + """ + # x = self.dropout(x) x = self.linear(x) if self.activation is not None: diff --git a/zoobot/pytorch/training/train_with_pytorch_lightning.py b/zoobot/pytorch/training/train_with_pytorch_lightning.py index 9e22b508..2c9e7524 100644 --- a/zoobot/pytorch/training/train_with_pytorch_lightning.py +++ b/zoobot/pytorch/training/train_with_pytorch_lightning.py @@ -66,42 +66,48 @@ def train_default_zoobot_from_scratch( ) -> Tuple[define_model.ZoobotTree, pl.Trainer]: """ Train Zoobot from scratch on a big galaxy catalog. - Zoobot is a base deep learning model (anything from timm, typically a CNN) plus a dirichlet head. - Images are augmented using the default transforms (flips, rotations, zooms) - from `the galaxy-datasets repo `_. - Once trained, Zoobot can be finetuned to new data. - For finetuning, see zoobot/pytorch/training/finetune.py. - Many pretrained models are already available - see :ref:`datanotes`. + **You don't need to use this**. + Training from scratch is becoming increasingly complicated (as you can see from the arguments) due to ongoing research on the best methods. + This will be refactored to a dedicated "foundation" repo. Args: save_dir (str): folder to save training logs and trained model checkpoints + schema (shared.schemas.Schema): Schema object with label_cols, question_answer_pairs, and dependencies catalog (pd.DataFrame, optional): Galaxy catalog with columns `id_str` and `file_loc`. Will be automatically split to train and val (no test). Defaults to None. train_catalog (pd.DataFrame, optional): As above, but already split by you for training. Defaults to None. val_catalog (pd.DataFrame, optional): As above, for validation. Defaults to None. test_catalog (pd.DataFrame, optional): As above, for testing. Defaults to None. + train_urls (list, optional): List of URLs to webdatasets for training. Defaults to None. + val_urls (list, optional): List of URLs to webdatasets for validation. Defaults to None. + test_urls (list, optional): List of URLs to webdatasets for testing. Defaults to None. + cache_dir (str, optional): Directory to cache webdatasets. Defaults to None. epochs (int, optional): Max. number of epochs to train for. Defaults to 1000. patience (int, optional): Max. number of epochs to wait for any loss improvement before ending training. Defaults to 8. architecture_name (str, optional): Architecture to use. Passed to timm. Must be in timm.list_models(). Defaults to 'efficientnet_b0'. 
+ timm_kwargs (dict, optional): Additional kwargs to pass to timm model init method, for example {'drop_connect_rate': 0.2}. Defaults to {}. + batch_size (int, optional): Batch size. Defaults to 128. dropout_rate (float, optional): Randomly drop activations prior to the output layer, with this probability. Defaults to 0.2. - drop_connect_rate (float, optional): Randomly drop blocks with this probability, for regularisation. For supported timm models only. Defaults to 0.2. learning_rate (float, optional): Base learning rate for AdamW. Defaults to 1e-3. betas (tuple, optional): Beta args (i.e. momentum) for adamW. Defaults to (0.9, 0.999). weight_decay (float, optional): Weight decay arg (i.e. L2 penalty) for AdamW. Defaults to 0.01. - scheduler_params (dict, optional): Specify a learning rate scheduler. See code. Not recommended with AdamW. Defaults to {}. + scheduler_params (dict, optional): Specify a learning rate scheduler. See code below. Defaults to {}. color (bool, optional): Train on RGB images rather than channel-averaged. Defaults to False. resize_after_crop (int, optional): Input image size. After all transforms, images will be resized to this size. Defaults to 224. crop_scale_bounds (tuple, optional): Off-center crop fraction (<1 means zoom in). Defaults to (0.7, 0.8). crop_ratio_bounds (tuple, optional): Aspect ratio of crop above. Defaults to (0.9, 1.1). nodes (int, optional): Multi-node training Unlikely to work on your cluster without tinkering. Defaults to 1 (i.e. one node). gpus (int, optional): Multi-GPU training. Uses distributed data parallel - essentially, full dataset is split by GPU. See torch docs. Defaults to 2. + sync_batchnorm (bool, optional): Use synchronized batchnorm. Defaults to False. num_workers (int, optional): Processes for loading data. See torch dataloader docs. Should be < num cpus. Defaults to 4. prefetch_factor (int, optional): Num. batches to queue in memory per dataloader. See torch dataloader docs. Defaults to 4. mixed_precision (bool, optional): Use (mostly) half-precision to halve memory requirements. May cause instability. See Lightning Trainer docs. Defaults to False. + compile_encoder (bool, optional): Compile the encoder with torch.compile (new in torch v2). Defaults to False. wandb_logger (pl.loggers.wandb.WandbLogger, optional): Logger to track experiments on Weights and Biases. Defaults to None. checkpoint_file_template (str, optional): formatting for checkpoint filename. See Lightning docs. Defaults to None. auto_insert_metric_name (bool, optional): escape "/" in metric names when naming checkpoints. See Lightning docs. Defaults to True. save_top_k (int, optional): Keep the k best checkpoints. See Lightning docs. Defaults to 3. + extra_callbacks (list, optional): Additional callbacks to pass to the Trainer. Defaults to None. random_state (int, optional): Seed. Defaults to 42. Returns: diff --git a/zoobot/shared/schemas.py b/zoobot/shared/schemas.py index 57ecb537..3f85dbbe 100755 --- a/zoobot/shared/schemas.py +++ b/zoobot/shared/schemas.py @@ -130,6 +130,7 @@ def set_dependencies(questions, dependencies): class Schema(): + def __init__(self, question_answer_pairs:dict, dependencies: dict): """ Relate the df label columns tor question/answer groups and to tfrecod label indices @@ -141,6 +142,23 @@ def __init__(self, question_answer_pairs:dict, dependencies: dict): - answers in between will be included: these are used to slice - df columns must be contigious by question (e.g. not smooth_yes, bar_no, smooth_no) for this to work! 
+ The following schemas are available via the module (e.g. `from zoobot.shared.schemas import decals_dr5_ortho_schema`): + - decals_dr5_ortho_schema + - decals_dr8_ortho_schema + - decals_all_campaigns_ortho_schema + - gz2_ortho_schema + - gz_candels_ortho_schema + - gz_hubble_ortho_schema + - cosmic_dawn_ortho_schema + - cosmic_dawn_schema + - gz_rings_schema + - desi_schema + - gz_evo_v1_schema (this is the schema currently used for pretraining) + - gz_ukidss_schema + - gz_jwst_schema + + "ortho" refers to the orthogonal question suffix (-cd, -dr8, etc). + Args: question_answer_pairs (dict): e.g. {'smooth-or-featured: ['_smooth, _featured-or-disk, ...], ...} dependencies (dict): dict mapping each question (e.g. disk-edge-on) to the answer on which it depends (e.g. smooth-or-featured_featured-or-disk) From c1744d0a889120639632a933abe46ae2f5463b82 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Apr 2024 15:12:30 -0400 Subject: [PATCH 299/307] Continue docs pass --- docs/guides/choosing_parameters.rst | 101 ++++++++++++++++++++++++++++ docs/guides/guides.rst | 16 ----- docs/index.rst | 15 ++++- zoobot/pytorch/training/finetune.py | 8 +-- 4 files changed, 118 insertions(+), 22 deletions(-) create mode 100644 docs/guides/choosing_parameters.rst delete mode 100755 docs/guides/guides.rst diff --git a/docs/guides/choosing_parameters.rst b/docs/guides/choosing_parameters.rst new file mode 100644 index 00000000..9cd4b337 --- /dev/null +++ b/docs/guides/choosing_parameters.rst @@ -0,0 +1,101 @@ +.. _choosing_parameters: + +Choosing Parameters +===================================== + +All FinetuneableZoobot classes share a common set of parameters for controlling the finetuning process. These can have a big effect on performance. + + +Finetuning is fast and easy to experiment with, so we recommend trying different parameters to see what works best for your dataset. +This guide provides some explanation for each option. + +We list the key parameters below in rough order of importance. +See :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotAbstract` for the full list of parameters. + +``learning_rate`` +............................... + +Learning rate sets how fast the model parameters are updated during training. +Zoobot uses the adaptive optimizer ``AdamW``. +Adaptive optimizers adjust the learning rate for each parameter based on the mean and variance of the previous gradients. +This means you don't need to tune the learning rate as carefully as you would with a fixed learning rate optimizer like SGD. +We find a learning of 1e-4 is a good starting point for most tasks. + +If you find the model is not learning, you can try increasing the learning rate. +If you see the model loss is varying wildly, or the train loss decreases much faster than the validation loss (overfitting), you can try decreasing the learning rate. +Increasing ``n_blocks`` (below) often requires a lower learning rate, as the model will adjust more parameters for each batch. + + +``n_blocks`` +............................... + +Deep learning models are often divided into blocks of layers. +For example, a ResNet model might have 4 blocks, each containing a number of convolutional layers. +The ``n_blocks`` parameter specifies how many of these blocks to finetune. + +By default, ``n_blocks=0``, and so only the head is finetuned. +This is a good choice when you have a small dataset or when you want to avoid overfitting. +Finetuning only the head is sometimes called transfer learning. 
+It's equivalent to calculating representations with the pretrained model and then training a new one-layer model on top of those representations. + +You can experiment with increasing ``n_blocks`` to finetune more of the model. +This works best for larger datasets (typically more than 1k examples). +To finetune the full model, keep increasing ``n_blocks``; Zoobot will raise an error if you try to finetune more blocks than the model has. +Our recommended encoder, ``ConvNext``, has 5 blocks. + + +``lr_decay`` +............................... + +The common intuition for deep learning is that lower blocks (near the input) learn simple general features and higher blocks (near the output) learn more complex features specific to your task. +It is often useful to adjust the learning rate to be lower for lower blocks, which have already been pretrained to recognise simple galaxy features. + +Learning rate decay reduces the learning rate by block. +For example, with ``learning_rate=1e-4`` and ``lr_decay=0.75`` (the default): + +* The highest block has a learning rate of 1e-4 * (0.75^0) = 1e-4 +* The second-highest block has a learning rate of 1e-4 * (0.75^1) = 7.5e-5 +* The third-highest block has a learning rate of 1e-4 * (0.75^2) = 5.6e-5 + +and so on. + +Decreasing ``lr_decay`` will exponentially decrease the learning rate for lower blocks. + +In the extreme cases: + +* Setting ``lr_decay=0`` will disable learning in all blocks except the first block (0^0=1), equivalent to ``n_blocks=1``. +* Setting ``lr_decay=1`` will give all blocks the same learning rate. + +The head always uses the full learning rate. + +``weight_decay`` +............................... + +Weight decay is a regularization term that penalizes large weights. +When using Zoobot's default ``AdamW`` optimizer, it is closely related to L2 regularization, though there's some subtlety - see https://arxiv.org/abs/1711.05101. +Increasing weight decay will increase the penalty on large weights, which can help prevent overfitting, but may slow or even stop training. +By default, Zoobot uses a small weight decay of 0.05. + + +``dropout_prob`` +............................... + +Dropout is a regularization technique that randomly sets some activations to zero during training. +Similarly to weight decay, dropout can help prevent overfitting. +Zoobot uses a dropout probability of 0.5 by default. + + +``cosine_schedule`` and friends +................................. + +Gradually reducing the learning rate during training can slightly improve results by finding a better minimum near convergence. +This process is called learning rate scheduling. +Zoobot includes a cosine learning rate schedule, which reduces the learning rate according to a cosine function. + +The cosine schedule is controlled by the following parameters: + +* ``cosine_schedule`` to enable the scheduler. +* ``warmup_epochs`` to linearly increase the learning rate from 0 to ``learning_rate`` over the first ``warmup_epochs`` epochs, before applying cosine scheduling. +* ``max_cosine_epochs`` sets how many epochs it takes to decay to the final learning rate (below). Warmup epochs don't count. +* ``max_learning_rate_reduction_factor`` controls the final learning rate (``learning_rate`` * ``max_learning_rate_reduction_factor``). 
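Putting these parameters together, a hedged sketch of a typical finetuning setup (the encoder name matches the HuggingFace checkpoints used elsewhere in these docs; the specific values below are illustrative starting points, not tuned recommendations):

.. code-block:: python

    from zoobot.pytorch.training.finetune import FinetuneableZoobotClassifier

    model = FinetuneableZoobotClassifier(
        name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano',
        num_classes=2,
        learning_rate=1e-4,    # suggested starting point (see above)
        n_blocks=2,            # finetune the head plus the top two encoder blocks
        lr_decay=0.75,         # per-block decay: 1e-4, 7.5e-5, 5.6e-5, ...
        weight_decay=0.05,     # default regularisation strength
        dropout_prob=0.5,      # default dropout before the output layer
        cosine_schedule=True,  # optional learning rate schedule
        warmup_epochs=2,
        max_cosine_epochs=100,
        max_learning_rate_reduction_factor=0.01,
    )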
+ \ No newline at end of file diff --git a/docs/guides/guides.rst b/docs/guides/guides.rst deleted file mode 100755 index 1de9e932..00000000 --- a/docs/guides/guides.rst +++ /dev/null @@ -1,16 +0,0 @@ - -Guides -====== - -Below are some practical guides for tasks that we hope Zoobot will be helpful for. - -.. toctree:: - :maxdepth: 2 - - /guides/finetuning - /guides/advanced_finetuning - /guides/how_the_code_fits_together - /guides/loading_data - /guides/training_on_vote_counts - -If you'd prefer worked examples, you can find those under `zoobot/pytorch/examples `_. diff --git a/docs/index.rst b/docs/index.rst index 64fbcac4..542b96b5 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,12 +24,23 @@ For more explanation, read on. User Guides ------------- -We've written these guides to add explanation and context. +These introductory guides add context to the demo Colab notebooks. + +.. toctree:: + :maxdepth: 1 + + /guides/finetuning + /guides/choosing_parameters + /guides/loading_data + /guides/training_on_vote_counts + +These advanced guides explain how to integrate Zoobot into other ML projects. .. toctree:: :maxdepth: 2 - /guides/guides + /guides/advanced_finetuning + /guides/how_the_code_fits_together Pretrained Models ------------------ diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py index c1e7402f..037fc553 100644 --- a/zoobot/pytorch/training/finetune.py +++ b/zoobot/pytorch/training/finetune.py @@ -61,10 +61,10 @@ class FinetuneableZoobotAbstract(pl.LightningModule): dropout_prob (float, optional): P of dropout before final output layer. Defaults to 0.5. always_train_batchnorm (bool, optional): Temporarily deprecated. Previously, if True, do not update batchnorm stats during finetuning. Defaults to True. cosine_schedule (bool, optional): Reduce the learning rate each epoch according to a cosine schedule, after warmup_epochs. Defaults to False. - warmup_epochs (int, optional): Linearly increase the learning rate from 0 to `learning_rate` over the first `warmup_epochs` epochs, before applying cosine schedule. No effect if cosine_schedule=False. - max_cosine_epochs (int, optional): Epochs for the scheduled learning rate to decay to final learning rate (below). Warmup epochs don't count. No effect if `cosine_schedule=False`. - max_learning_rate_reduction_factor (float, optional): Set final learning rate as `learning_rate` * `max_learning_rate_reduction_factor`. No effect if `cosine_schedule=False`. - from_scratch (bool, optional): Ignore all settings above and train from scratch at `learning_rate` for all layers. Useful for a quick baseline. Defaults to False. + warmup_epochs (int, optional): Linearly increase the learning rate from 0 to ``learning_rate`` over the first ``warmup_epochs`` epochs, before applying cosine schedule. No effect if cosine_schedule=False. + max_cosine_epochs (int, optional): Epochs for the scheduled learning rate to decay to final learning rate (below). Warmup epochs don't count. No effect if ``cosine_schedule=False``. + max_learning_rate_reduction_factor (float, optional): Set final learning rate as ``learning_rate`` * ``max_learning_rate_reduction_factor``. No effect if ``cosine_schedule=False``. + from_scratch (bool, optional): Ignore all settings above and train from scratch at ``learning_rate`` for all layers. Useful for a quick baseline. Defaults to False. prog_bar (bool, optional): Print progress bar during finetuning. Defaults to True. visualize_images (bool, optional): Upload example images to WandB. 
Good for debugging but slow. Defaults to False. seed (int, optional): random seed to use. Defaults to 42. From e762128663b5f38a71504f2763a47e67e4a68540 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Apr 2024 17:57:46 -0400 Subject: [PATCH 300/307] docsing --- .readthedocs.yaml | 5 +- README.md | 4 +- docs/conf.py | 7 +- docs/guides/advanced_finetuning.rst | 2 +- docs/guides/finetuning.rst | 2 +- docs/index.rst | 16 +- .../{data_notes.rst => pretrained_models.rst} | 170 +++++++++--------- docs/science_data.rst | 4 + .../finetune_binary_classification.py | 7 +- 9 files changed, 119 insertions(+), 98 deletions(-) rename docs/{data_notes.rst => pretrained_models.rst} (50%) create mode 100644 docs/science_data.rst diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 2b64d49a..fe4c5fdb 100755 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,14 +1,13 @@ version: 2 python: - version: 3.8 + version: 3.9 install: - method: pip path: . extra_requirements: - docs - - pytorch_m1 - - tensorflow + - pytorch_cpu sphinx: fail_on_warning: true \ No newline at end of file diff --git a/README.md b/README.md index f919f978..91266461 100755 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Zoobot is trained using millions of answers by Galaxy Zoo volunteers. This code - [Install](#installation) - [Quickstart](#quickstart) - [Worked Examples](#worked-examples) -- [Pretrained Weights](https://zoobot.readthedocs.io/en/latest/data_notes.html) +- [Pretrained Weights](https://zoobot.readthedocs.io/en/latest/pretrained_models.html) - [Datasets](https://www.github.com/mwalmsley/galaxy-datasets) - [Documentation](https://zoobot.readthedocs.io/) (for understanding/reference) @@ -101,7 +101,7 @@ I suggest starting with the [Colab notebook](https://colab.research.google.com/d For context and explanation, see the [documentation](https://zoobot.readthedocs.io/). -For pretrained model weights, precalculated representations, catalogues, and so forth, see the [data notes](https://zoobot.readthedocs.io/en/latest/data_notes.html) in particular. +Pretrained models are listed [here](https://zoobot.readthedocs.io/en/latest/pretrained_models.html) and available on [HuggingFace](https://huggingface.co/collections/mwalmsley/zoobot-encoders-65fa14ae92911b173712b874) ### Worked Examples diff --git a/docs/conf.py b/docs/conf.py index 227ce95e..87c633f5 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,11 +19,11 @@ # -- Project information ----------------------------------------------------- project = 'Zoobot' -copyright = '2023, Mike Walmsley' +copyright = '2024, Mike Walmsley' author = 'Mike Walmsley' # The full version, including alpha/beta/rc tags -release = '0.0.4' +release = '2.0' # -- General configuration --------------------------------------------------- @@ -33,7 +33,8 @@ # ones. extensions = [ 'sphinx.ext.autodoc', # import docs from code - 'sphinx.ext.napoleon' # google docstrings + 'sphinx.ext.napoleon', # google docstrings + 'sphinxemoji.sphinxemoji', # emoji support https://sphinxemojicodes.readthedocs.io/en/stable/ ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/guides/advanced_finetuning.rst b/docs/guides/advanced_finetuning.rst index 6554f69c..1ef3b86a 100644 --- a/docs/guides/advanced_finetuning.rst +++ b/docs/guides/advanced_finetuning.rst @@ -96,7 +96,7 @@ utility function used for making predictions with a full Zoobot model. See `zoobot/pytorch/examples/representations `_ for a full working example. 
We are sharing precalculated representations for all our DESI galaxies, and soon for HSC as well. -Check the data notes at :doc:/data_notes +Check the data notes at :doc:/data_access. The representations are typically quite high-dimensional (e.g. 1280 for EfficientNetB0) and therefore highly redundant. We suggest using PCA to compress them down to a more reasonable dimension (e.g. 40) while preserving most of the information. diff --git a/docs/guides/finetuning.rst b/docs/guides/finetuning.rst index d46eee1b..122d167e 100755 --- a/docs/guides/finetuning.rst +++ b/docs/guides/finetuning.rst @@ -65,7 +65,7 @@ These files are called checkpoints (like video game save files - computer scient n_layers=0 ) -You can download a checkpoint file from :ref:`datanotes`. +You can download a checkpoint file from :ref:`pretrainedmodels`. What about the other arguments? When loading the checkpoint, FinetuneableZoobotClassifier will automatically change the head layer to suit a classification problem (hence, ``Classifier``). diff --git a/docs/index.rst b/docs/index.rst index 542b96b5..4f062cf3 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -50,7 +50,21 @@ To choose and download a pretrained model, see here. .. toctree:: :maxdepth: 2 - data_notes + pretrained_models + + +Science-Ready Data +------------------ + +You can find our science outputs (e.g. morphology catalogs, precalculated representations) here. + +.. toctree:: + :maxdepth: 2 + + science_data + +We are working on releasing the compiled GZ Evo dataset and will update this page when it is available. +Estimated public release is Q4 2024. Please reach out if you'd like early access. API reference diff --git a/docs/data_notes.rst b/docs/pretrained_models.rst similarity index 50% rename from docs/data_notes.rst rename to docs/pretrained_models.rst index e6ce0a4f..a54882ad 100755 --- a/docs/data_notes.rst +++ b/docs/pretrained_models.rst @@ -1,107 +1,101 @@ -.. _datanotes: +.. pretrainedmodels: Pretrained Models -================= +------------------ -Zoobot includes weights for the following pretrained models. +Loading Models +========================== -.. list-table:: PyTorch Models +Pretrained models are available via HuggingFace (|:hugging:|) with + +.. code-block:: python + + from zoobot.pytorch.training.finetune import FinetuneableZoobotClassifier + # or FinetuneableZoobotRegressor, or FinetuneableZoobotTree + + model = FinetuneableZoobotClassifier(name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano') + +For more options (e.g. loading the ``timm`` encoder directly) see :doc:`guides/advanced_finetuning`. + +Available Models +========================== + +Zoobot includes weights for the following pretrained models: + + +.. 
list-table:: :widths: 70 35 35 35 35 :header-rows: 1 * - Architecture - - Input Size - - Channels + - Parameters + - Test loss - Finetune - - Link - * - EfficientNetB0 - - 224px - - 1 + - HF |:hugging:| + * - ConvNeXT-Nano + - 15.6M + - 19.23 - Yes - - `Link `__ - * - EfficientNetB0 - - 300px - - 1 + - `Link `__ + * - ConvNeXT-Small + - 58.5M + - 19.14 - Yes - - `Link `__ - * - EfficientNetB0 - - 224px - - 3 + - `Link `__ + * - ConvNeXT-Base + - 88.6M + - **19.04** - Yes - - `Link `__ - * - ResNet50 - - 300px - - 1 + - `Link `__ + * - ConvNeXT-Large + - 197.8M + - 19.09 - Yes - - `Link `__ - * - ResNet50 - - 224px - - 1 + - `Link `__ + * - MaxViT-Small + - 64.9M + - 19.20 - Yes - - `Link `__ - * - ResNet18 - - 300px - - 1 + - `Link `__ + * - MaxViT-Base + - 124.5 + - 19.09 - Yes - - `Link `__ - * - ResNet18 - - 224px - - 1 + - TODO + * - Max-ViT-Large + - 211.8M + - 19.18 - Yes - - `Link `__ - * - Max-ViT Tiny - - 224px - - 1 + - `Link `__ + * - EfficientNetB0 + - 5.33M + - 19.48 - Yes - - `Link `__ - * - Max-ViT Tiny - - 224px - - 3 + - `Link `__ + * - EfficientNetV2-S + - 48.3M + - 19.33 - Yes - - `Link `__ - - - -.. list-table:: TensorFlow Models - :widths: 70 35 35 35 35 - :header-rows: 1 - - * - Architecture - - Input Size - - Channels - - Finetune - - Link - * - EfficientNetB0 - - 300px - - 1 + - `Link `__ + * - ResNet18 + - 11.7M + - 19.43 - Yes - - `Link `__ - * - EfficientNetB0 - - 224px - - 1 + - `Link `__ + * - ResNet50 + - 25.6M + - 19.83 - Yes - - WIP + - `Link `__ .. note:: - Missing a model you need? Reach out! There's a good chance we can train any small-ish model supported by `timm `_. - -All models are trained on the GZ Evo dataset described in the `Towards Foundation Models paper `_. -This dataset includes 550k galaxy images and 92M votes drawn from every major Galaxy Zoo campaign: GZ2, GZ Hubble, GZ CANDELS, and GZ DECaLS/DESI. - -All models are trained on the same images shown to Galaxy Zoo volunteers. -These are typically 424 pixels across. -The images are transformed using the galaxy-datasets default transforms (random off-center crop/zoom, flips, rotation) and then resized to the desired input size (224px or 300px) and, for 1-channel models, channel-averaged. - -We also include a few additional ad-hoc models `on Dropbox `_. - -- EfficientNetB0 models pretrained only on GZ DECaLS GZD-5. For reference/comparison. -- EfficientNetB0 models pretrained with smaller images (128px and 64px). For debugging. + Missing a model you need? Reach out! There's a good chance we can train any model supported by `timm `_. Which model should I use? --------------------------- +=========================== We suggest the PyTorch EfficientNetB0 224-pixel model for most users. @@ -120,8 +114,22 @@ However, the models require more memory and train/finetune slightly more slowly. You may want to start with a 224px model and experiment with "upgrading" once you're happy everything works. -What about the images? --------------------------- +All models are trained on the GZ Evo dataset described in the `Towards Foundation Models paper `_. +This dataset includes 550k galaxy images and 92M votes drawn from every major Galaxy Zoo campaign: GZ2, GZ Hubble, GZ CANDELS, and GZ DECaLS/DESI. + +All models are trained on the same images shown to Galaxy Zoo volunteers. +These are typically 424 pixels across. 
+The images are transformed using the galaxy-datasets default transforms (random off-center crop/zoom, flips, rotation) and then resized to the desired input size (224px or 300px) and, for 1-channel models, channel-averaged. + +We also include a few additional ad-hoc models `on Dropbox `_. + +- EfficientNetB0 models pretrained only on GZ DECaLS GZD-5. For reference/comparison. +- EfficientNetB0 models pretrained with smaller images (128px and 64px). For debugging. + + + +.. What about the images? +.. -------------------------- -You can find most of our datasets on the `galaxy-datasets repo `_. -The datasets are self-downloading and have loading functions for both PyTorch and TensorFlow. +.. You can find most of our datasets on the `galaxy-datasets repo `_. +.. The datasets are self-downloading and have loading functions for both PyTorch and TensorFlow. diff --git a/docs/science_data.rst b/docs/science_data.rst new file mode 100644 index 00000000..80722bf3 --- /dev/null +++ b/docs/science_data.rst @@ -0,0 +1,4 @@ +.. sciencedata: + +Science Data +------------- \ No newline at end of file diff --git a/zoobot/pytorch/examples/finetuning/finetune_binary_classification.py b/zoobot/pytorch/examples/finetuning/finetune_binary_classification.py index c5309e8b..4cf7efff 100644 --- a/zoobot/pytorch/examples/finetuning/finetune_binary_classification.py +++ b/zoobot/pytorch/examples/finetuning/finetune_binary_classification.py @@ -26,11 +26,6 @@ # For binary classification, the label column should have binary (0 or 1) labels for your classes # To support more complicated labels, Zoobot expects a list of columns. A list with one element works fine. - # load a pretrained checkpoint saved here - # https://www.dropbox.com/s/7ixwo59imjfz4ay/effnetb0_greyscale_224px.ckpt?dl=0 - # see https://zoobot.readthedocs.io/en/latest/data_notes.html for more options - checkpoint_loc = os.path.join(zoobot_dir, 'data/pretrained_models/pytorch/effnetb0_greyscale_224px.ckpt') - # save the finetuning results here save_dir = os.path.join(zoobot_dir, 'results/pytorch/finetune/finetune_binary_classification') @@ -47,7 +42,7 @@ model = finetune.FinetuneableZoobotClassifier( - checkpoint_loc=checkpoint_loc, + name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', num_classes=2, n_layers=0 # only updating the head weights. Set e.g. 1, 2 to finetune deeper. ) From 08a66f30556c07f587a6fbf520c229b55ff2216f Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Apr 2024 17:59:15 -0400 Subject: [PATCH 301/307] readthedocs --- .readthedocs.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index fe4c5fdb..a430ba1e 100755 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,7 +1,11 @@ version: 2 +build: + os: ubuntu-22.04 + tools: + python: "3.9" + python: - version: 3.9 install: - method: pip path: . From 579bcb46ef42fb70f23fa393b0ffa779d39c5141 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Apr 2024 18:00:12 -0400 Subject: [PATCH 302/307] m1 --- .readthedocs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index a430ba1e..822a10af 100755 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -11,7 +11,7 @@ python: path: . 
extra_requirements: - docs - - pytorch_cpu + - pytorch_m1 sphinx: fail_on_warning: true \ No newline at end of file From c9f6fa616698690ec235277c9430f17710ed43c7 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Apr 2024 18:20:42 -0400 Subject: [PATCH 303/307] add sphinxemoji (vital) --- docs/pretrained_models.rst | 51 ++++++++++++++------------------------ setup.py | 3 ++- 2 files changed, 21 insertions(+), 33 deletions(-) diff --git a/docs/pretrained_models.rst b/docs/pretrained_models.rst index a54882ad..89396f88 100755 --- a/docs/pretrained_models.rst +++ b/docs/pretrained_models.rst @@ -79,12 +79,12 @@ Zoobot includes weights for the following pretrained models: - `Link `__ * - ResNet18 - 11.7M - - 19.43 + - 19.83 - Yes - `Link `__ * - ResNet50 - 25.6M - - 19.83 + - 19.43 - Yes - `Link `__ @@ -97,39 +97,26 @@ Zoobot includes weights for the following pretrained models: Which model should I use? =========================== -We suggest the PyTorch EfficientNetB0 224-pixel model for most users. - -Zoobot will prioritise PyTorch going forward. For more, see here. -The TensorFlow models currently perform just as well as the PyTorch equivalents but will not benefit from any future updates. - -EfficientNetB0 is a small yet capable modern architecture. -The ResNet50 models perform slightly worse than EfficientNet, but are a very common architecture and may be useful as benchmarks or as part of other frameworks (like detectron2, for segmentation). - -It's unclear if color information improves overall performance at predicting GZ votes. -For CNNs, the change in performance is not significant. For ViT, it is measureable but small. -We suggesst including color if it is expected to be important to your specific task, such as hunting green peas. - -Larger input images (300px vs 224px) may provide a small boost in performance at predicting GZ votes. -However, the models require more memory and train/finetune slightly more slowly. -You may want to start with a 224px model and experiment with "upgrading" once you're happy everything works. - - -All models are trained on the GZ Evo dataset described in the `Towards Foundation Models paper `_. -This dataset includes 550k galaxy images and 92M votes drawn from every major Galaxy Zoo campaign: GZ2, GZ Hubble, GZ CANDELS, and GZ DECaLS/DESI. - -All models are trained on the same images shown to Galaxy Zoo volunteers. -These are typically 424 pixels across. -The images are transformed using the galaxy-datasets default transforms (random off-center crop/zoom, flips, rotation) and then resized to the desired input size (224px or 300px) and, for 1-channel models, channel-averaged. +We suggest starting with ConvNeXT-Nano for most users. +ConvNeXT-Nano performs very well while still being small enough to train on a single gaming GPU. +You will be able to experiment quickly. -We also include a few additional ad-hoc models `on Dropbox `_. +For maximum performance, you could swap ConvNeXT-Nano for ConvNeXT-Small or ConvNeXT-Base. +MaxViT-Base also performs well and includes an ingenious attention mechanism, if you're interested in that. +All these models are much larger and need cluster-grade GPUs (e.g. V100 or above). -- EfficientNetB0 models pretrained only on GZ DECaLS GZD-5. For reference/comparison. -- EfficientNetB0 models pretrained with smaller images (128px and 64px). For debugging. +Other models are included for reference or as benchmarks. +EfficientNetB0 is equivalent to the model used in the GZ DECaLS and GZ DESI papers. 
+ResNet18 and ResNet50 are classics of the genre and may be useful for comparison or as part of other frameworks (like detectron2, for segmentation). +How were the models trained? +=============================== -.. What about the images? -.. -------------------------- +The models were trained as part of the report `Scaling Laws for Galaxy Images `_. +This report systematically investigates how increasing labelled galaxy data and model size improves performance +and leads to adaptable models that generalise well to new tasks and new telescopes. -.. You can find most of our datasets on the `galaxy-datasets repo `_. -.. The datasets are self-downloading and have loading functions for both PyTorch and TensorFlow. +All models are trained on the GZ Evo dataset, +which includes 820k images and 100M+ volunteer votes drawn from every major Galaxy Zoo campaign: GZ2, GZ UKIDSS (unpublished), GZ Hubble, GZ CANDELS, GZ DECaLS/DESI, and GZ Cosmic Dawn (HSC, in prep.). +They learn an adaptable representation of galaxy images by training to answer every Galaxy Zoo question at once. diff --git a/setup.py b/setup.py index 9c50ad2a..535e7dc0 100755 --- a/setup.py +++ b/setup.py @@ -97,7 +97,8 @@ 'Sphinx', 'sphinxcontrib-napoleon', 'furo', - 'docutils<0.18' + 'docutils<0.18', + 'spinxemoji' ] }, install_requires=[ From 30b79cadac193787a90f552f2df9e743bc5d9abf Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Apr 2024 18:20:56 -0400 Subject: [PATCH 304/307] typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 535e7dc0..dde24b50 100755 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ 'sphinxcontrib-napoleon', 'furo', 'docutils<0.18', - 'spinxemoji' + 'sphinxemoji' ] }, install_requires=[ From 2f68c7eb0285cd4689dfc2055f84540c651b674d Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Wed, 3 Apr 2024 18:38:11 -0400 Subject: [PATCH 305/307] rebuild --- README.md | 15 ++++++++----- docs/guides/finetuning.rst | 2 +- docs/pretrained_models.rst | 2 +- docs/science_data.rst | 46 +++++++++++++++++++++++++++++++++++++- 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 91266461..d3b536dc 100755 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ Zoobot is trained using millions of answers by Galaxy Zoo volunteers. This code - [Documentation](https://zoobot.readthedocs.io/) (for understanding/reference) ## Installation + You can retrain Zoobot in the cloud with a free GPU using this [Google Colab notebook](https://colab.research.google.com/drive/1A_-M3Sz5maQmyfW2A7rEu-g_Zi0RMGz5?usp=sharing). To install locally, keep reading. @@ -47,13 +48,14 @@ To use a GPU, you must *already* have CUDA installed and matching the versions a I share my install steps [here](#install_cuda). GPUs are optional - Zoobot will run retrain fine on CPU, just slower. ## Quickstart + The [Colab notebook](https://colab.research.google.com/drive/1A_-M3Sz5maQmyfW2A7rEu-g_Zi0RMGz5?usp=sharing) is the quickest way to get started. Alternatively, the minimal example below illustrates how Zoobot works. Let's say you want to find ringed galaxies and you have a small labelled dataset of 500 ringed or not-ringed galaxies. 
You can retrain Zoobot to find rings like so: -```python + ```python import pandas as pd from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule @@ -74,11 +76,11 @@ Let's say you want to find ringed galaxies and you have a small labelled dataset # retrain to find rings trainer = finetune.get_trainer(save_dir) trainer.fit(model, datamodule) -``` + ``` Then you can make predict if new galaxies have rings: -```python + ```python from zoobot.pytorch.predictions import predict_on_catalog # csv with 'file_loc' column (path to image). Zoobot will predict the labels. @@ -90,11 +92,12 @@ Then you can make predict if new galaxies have rings: label_cols=['ring'], # only used for save_loc='/your/path/finetuned_predictions.csv' ) -``` + ``` Zoobot includes many guides and working examples - see the [Getting Started](#getting-started) section below. ## Getting Started + I suggest starting with the [Colab notebook](https://colab.research.google.com/drive/1A_-M3Sz5maQmyfW2A7rEu-g_Zi0RMGz5?usp=sharing) or the worked examples below, which you can copy and adapt. @@ -104,9 +107,11 @@ For context and explanation, see the [documentation](https://zoobot.readthedocs. Pretrained models are listed [here](https://zoobot.readthedocs.io/en/latest/pretrained_models.html) and available on [HuggingFace](https://huggingface.co/collections/mwalmsley/zoobot-encoders-65fa14ae92911b173712b874) ### Worked Examples + PyTorch (recommended): + - [pytorch/examples/finetuning/finetune_binary_classification.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/pytorch/examples/finetuning/finetune_binary_classification.py) - [pytorch/examples/finetuning/finetune_counts_full_tree.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/pytorch/examples/finetuning/finetune_counts_full_tree.py) - [pytorch/examples/representations/get_representations.py](https://github.com/mwalmsley/zoobot/blob/main/zoobot/pytorch/examples/representations/get_representations.py) @@ -119,8 +124,8 @@ I also [include](https://github.com/mwalmsley/zoobot/blob/main/benchmarks) the s When trained with a decision tree head (ZoobotTree, FinetuneableZoobotTree), Zoobot can learn from volunteer labels of varying confidence and predict posteriors for what the typical volunteer might say. Specifically, this Zoobot mode predicts the parameters for distributions, not simple class labels! For a demonstration of how to interpret these predictions, see the [gz_decals_data_release_analysis_demo.ipynb](https://github.com/mwalmsley/zoobot/blob/main/gz_decals_data_release_analysis_demo.ipynb). - ### (Optional) Install PyTorch with CUDA + *If you're not using a GPU, skip this step. Use the pytorch-cpu option in the section below.* diff --git a/docs/guides/finetuning.rst b/docs/guides/finetuning.rst index 122d167e..20e94174 100755 --- a/docs/guides/finetuning.rst +++ b/docs/guides/finetuning.rst @@ -65,7 +65,7 @@ These files are called checkpoints (like video game save files - computer scient n_layers=0 ) -You can download a checkpoint file from :ref:`pretrainedmodels`. +You can download a checkpoint file from :doc:`/pretrained_models`. What about the other arguments? When loading the checkpoint, FinetuneableZoobotClassifier will automatically change the head layer to suit a classification problem (hence, ``Classifier``). 
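For readers skimming this patch, the new name-based loading shown in the finetuning.rst hunk above drops into the existing quickstart workflow roughly as follows. This is a minimal sketch, not part of the patch itself: the catalog path, label column, and save directory are placeholders, and the datamodule/trainer calls mirror the README example earlier in this series.

```python
import pandas as pd

from galaxy_datasets.pytorch.galaxy_datamodule import GalaxyDataModule
from zoobot.pytorch.training import finetune

# catalog with a binary label column and a 'file_loc' column pointing at each image (placeholder path)
labelled_df = pd.read_csv('/your/path/some_labelled_galaxies.csv')

datamodule = GalaxyDataModule(
    label_cols=['ring'],
    catalog=labelled_df,
    batch_size=32
)

# download the pretrained encoder by name (the new hf_hub syntax above) and attach a 2-class head
model = finetune.FinetuneableZoobotClassifier(
    name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano',
    num_classes=2,
    n_layers=0  # head only, as in the docs example; increase to finetune deeper
)

trainer = finetune.get_trainer('/your/path/save_dir')  # placeholder save directory
trainer.fit(model, datamodule)
```
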
diff --git a/docs/pretrained_models.rst b/docs/pretrained_models.rst index 89396f88..84a1e34b 100755 --- a/docs/pretrained_models.rst +++ b/docs/pretrained_models.rst @@ -107,7 +107,7 @@ All these models are much larger and need cluster-grade GPUs (e.g. V100 or above Other models are included for reference or as benchmarks. EfficientNetB0 is equivalent to the model used in the GZ DECaLS and GZ DESI papers. -ResNet18 and ResNet50 are classics of the genre and may be useful for comparison or as part of other frameworks (like detectron2, for segmentation). +ResNet18 and ResNet50 are classics of the genre and may be useful for comparison or as part of other frameworks (like as an `object detection backbone `_). How were the models trained? diff --git a/docs/science_data.rst b/docs/science_data.rst index 80722bf3..7f75f462 100644 --- a/docs/science_data.rst +++ b/docs/science_data.rst @@ -1,4 +1,48 @@ .. sciencedata: Science Data -------------- \ No newline at end of file +------------- + +The goal of Zoobot is to do science. Here are some science-ready datasets created with Zoobot. + +Precalulated Representations +============================= + +.. warning:: + + New for Zoobot v2! We're really excited to see what you build. Reach out for help. + +Zoobot v2 now includes precalculated representations for galaxies in the Galaxy Zoo DESI data release. + +You could use these to power a similarity search, anomaly recommendation system, multi-modal model, +or really anything else that needs a short vector summarizing the morphology in a galaxy image. + + +.. list-table:: + :widths: 35 35 35 35 35 35 + :header-rows: 1 + + * - dr8_id + - ra + - dec + - pca_feat_0 + - pca_feat_1 + - ... + * - TODO + - TODO + - TODO + - TODO + - TODO + - ... + +``dr8_id`` is the unique identifier for the galaxy in the DESI Legacy Surveys DR8 release and can be crossmatched with the GZ DESI catalogs, below. +It is formed with ``{brickid}_{objid}`` where brickid is the unique identifier for the brick in the Legacy Surveys and objid is the unique identifier for the object in the brick. +``RA`` and ``Dec`` are in degrees. +The PCA features are the first N principal components representation (which is otherwse impractically large to work with). + +Galaxy Zoo Morphology +======================= + +Zoobot was used to create a detailed morphology catalog for every (extended, brighter than r=19) galaxy in the DESI Legacy Surveys (8.7M galaxies). + +We aim to provide both representations and an updated morphology catalog for DESI-LS DR10. From acfd97071593b9635be31727e87b6359870ca669 Mon Sep 17 00:00:00 2001 From: Mike Walmsley Date: Thu, 4 Apr 2024 14:04:46 -0400 Subject: [PATCH 306/307] final docs update? --- docs/guides/advanced_finetuning.rst | 70 +++++++++++----------- docs/guides/finetuning.rst | 8 +-- docs/guides/how_the_code_fits_together.rst | 2 + docs/science_data.rst | 35 +++++++---- 4 files changed, 64 insertions(+), 51 deletions(-) diff --git a/docs/guides/advanced_finetuning.rst b/docs/guides/advanced_finetuning.rst index 1ef3b86a..767703c7 100644 --- a/docs/guides/advanced_finetuning.rst +++ b/docs/guides/advanced_finetuning.rst @@ -31,37 +31,7 @@ or, because Zoobot encoders are `timm` models, you can just directly use `timm`: You can use it like any other `timm` model. For example, we did this to `add contrastive learning `_. Good luck! 
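The hunk above keeps the `timm` route as unchanged context, but the accompanying code block falls outside the diff. A hedged sketch of what loading the encoder directly through `timm` might look like is below; the encoder name is taken from the examples elsewhere in this patch series, and the 3-channel 224px input shape is an assumption, so check the model card for whichever encoder you pick.

```python
import timm
import torch

# pull a pretrained Zoobot encoder from the HuggingFace Hub via timm
encoder = timm.create_model(
    'hf_hub:mwalmsley/zoobot-encoder-convnext_nano',
    pretrained=True,
    num_classes=0  # no classification head; forward pass returns pooled features
)
encoder.eval()

# dummy batch just to show the call; real inputs come from your dataloader
with torch.no_grad():
    features = encoder(torch.rand(2, 3, 224, 224))  # assumed input shape

print(features.shape)  # (2, feature_dim): one pooled representation per galaxy
```
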
- - -Subclassing FinetuneableZoobotAbstract ---------------------------------------- - -If you'd like to finetune Zoobot on a new task that isn't classification, regression, or vote counts, -you could instead subclass :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotAbstract`. -This lets you use our finetuning code with your own head and loss functions. - -Imagine there wasn't a regression version and you wanted to finetune Zoobot on a regression task. You could do: - -.. code-block:: python - - - class FinetuneableZoobotCustomRegression(FinetuneableZoobotAbstract): - - def __init__( - self, - foo, - **super_kwargs - ): - - super().__init__(**super_kwargs) - - self.foo = foo - self.loss = torch.nn.SomeCrazyLoss() - self.head = torch.nn.Sequential(my_crazy_head) - - # see zoobot/pytorch/training/finetune.py for more examples and all methods required - -You can then finetune this new class just as with e.g. :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotRegressor`. +If you don't need to change the encoder and just want representations, see below. Extracting Frozen Representations @@ -71,7 +41,7 @@ Once you've finetuned to your survey, or if you're using a pretrained survey, (S the representations can be stored as frozen vectors and used as features. We use this at Galaxy Zoo to power our upcoming similary search and anomaly-finding tools. -As above, we can get Zoobot's encoder from the .encoder attribute. We could use ``encoder()`` to calculate our representations. +As above, we can get Zoobot's encoder from the .encoder attribute. We could use ``encoder.forward()`` to calculate our representations. But then we'd have to deal with batching, looping, etc. To avoid this boilerplate, Zoobot includes a PyTorch Lightning class that lets you pass ``encoder`` to the same :func:`zoobot.pytorch.predictions.predict_on_catalog.predict` utility function used for making predictions with a full Zoobot model. @@ -95,9 +65,41 @@ utility function used for making predictions with a full Zoobot model. See `zoobot/pytorch/examples/representations `_ for a full working example. -We are sharing precalculated representations for all our DESI galaxies, and soon for HSC as well. -Check the data notes at :doc:/data_access. +We have precalculated representations for all our DESI galaxies, and soon for HSC as well. +See :doc:`/science_data`. The representations are typically quite high-dimensional (e.g. 1280 for EfficientNetB0) and therefore highly redundant. We suggest using PCA to compress them down to a more reasonable dimension (e.g. 40) while preserving most of the information. This was our approach in the `Practical Morphology Tools paper `_. + + +Subclassing FinetuneableZoobotAbstract +--------------------------------------- + +If you'd like to finetune Zoobot on a new task that isn't classification, regression, or vote counts, +you could instead subclass :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotAbstract`. +This lets you use our finetuning code with your own head and loss functions. + +Imagine there wasn't a regression version and you wanted to finetune Zoobot on a regression task. You could do: + +.. 
code-block:: python + + + class FinetuneableZoobotCustomRegression(FinetuneableZoobotAbstract): + + def __init__( + self, + foo, + **super_kwargs + ): + + super().__init__(**super_kwargs) + + self.foo = foo + self.loss = torch.nn.SomeCrazyLoss() + self.head = torch.nn.Sequential(my_crazy_head) + + # see zoobot/pytorch/training/finetune.py for more examples and all methods required + +You can then finetune this new class just as with e.g. :class:`zoobot.pytorch.training.finetune.FinetuneableZoobotRegressor`. + diff --git a/docs/guides/finetuning.rst b/docs/guides/finetuning.rst index 20e94174..bce4fb56 100755 --- a/docs/guides/finetuning.rst +++ b/docs/guides/finetuning.rst @@ -30,12 +30,10 @@ Examples Zoobot includes many working examples of finetuning: -- `Google Colab notebook `__ (for binary classification in the cloud) +- `Google Colab notebook `__ (recommended starting point) - `finetune_binary_classification.py `__ (script version of the Colab notebook) - `finetune_counts_full_tree.py `__ (for finetuning on a complicated GZ-style decision tree) -There are also `examples `__ with the TensorFlow version of Zoobot. But this is no longer actively developed so we strongly suggest using the PyTorch version if possible. - Below, for less familiar readers, we walk through the ``finetune_binary_classification.py`` example in detail. Background @@ -60,12 +58,12 @@ These files are called checkpoints (like video game save files - computer scient .. code-block:: python model = finetune.FinetuneableZoobotClassifier( - checkpoint_loc=checkpoint_loc, # loads weights from here + name='hf_hub:mwalmsley/zoobot-encoder-convnext_nano', # which pretrained model to download num_classes=2, n_layers=0 ) -You can download a checkpoint file from :doc:`/pretrained_models`. +You can see the list of pretrained models at :doc:`/pretrained_models`. What about the other arguments? When loading the checkpoint, FinetuneableZoobotClassifier will automatically change the head layer to suit a classification problem (hence, ``Classifier``). diff --git a/docs/guides/how_the_code_fits_together.rst b/docs/guides/how_the_code_fits_together.rst index 9c816ad5..437bcbc7 100644 --- a/docs/guides/how_the_code_fits_together.rst +++ b/docs/guides/how_the_code_fits_together.rst @@ -10,6 +10,7 @@ The Map ------------------------- The Zoobot package has two roles: + 1. **Finetuning**: ``pytorch/training/finetune.py`` is the heart of the package. You will use these classes to load pretrained models and finetune them on new data. 2. **Training from Scratch** ``pytorch/estimators/define_model.py`` and ``pytorch/training/train_with_pytorch_lightning.py`` create and train the Zoobot models from scratch. These are *not required* for finetuning and will eventually be migrated out. @@ -20,6 +21,7 @@ Finetuning with Zoobot Classes There are three Zoobot classes for finetuning: + 1. :class:`FinetuneableZoobotClassifier ` for classification tasks (including multi-class). 2. :class:`FinetuneableZoobotRegressor ` for regression tasks (including on a unit interval e.g. a fraction). 3. :class:`FinetuneableZoobotTree ` for training on a tree of labels (e.g. Galaxy Zoo vote counts). diff --git a/docs/science_data.rst b/docs/science_data.rst index 7f75f462..569a3110 100644 --- a/docs/science_data.rst +++ b/docs/science_data.rst @@ -13,36 +13,47 @@ Precalulated Representations New for Zoobot v2! We're really excited to see what you build. Reach out for help. 
 Zoobot v2 now includes precalculated representations for galaxies in the Galaxy Zoo DESI data release.
+Download `here `_ (2.5GB).
 
-You could use these to power a similarity search, anomaly recommendation system, multi-modal model,
+You could use these to power a similarity search, anomaly recommendation system, the vision part of a multi-modal model,
 or really anything else that needs a short vector summarizing the morphology in a galaxy image.
 
 
+
 .. list-table::
    :widths: 35 35 35 35 35 35
    :header-rows: 1
 
-   * - dr8_id
+   * - id_str
      - ra
      - dec
-     - pca_feat_0
-     - pca_feat_1
+     - feat_pca_0
+     - feat_pca_1
      - ...
-   * - TODO
-     - TODO
-     - TODO
-     - TODO
-     - TODO
+   * - 303240_2499
+     - 4.021870
+     - 3.512972
+     - 0.257407
+     - -7.414328
      - ...
 
-``dr8_id`` is the unique identifier for the galaxy in the DESI Legacy Surveys DR8 release and can be crossmatched with the GZ DESI catalogs, below.
+``id_str`` is the unique identifier for the galaxy in the DESI Legacy Surveys DR8 release and can be crossmatched with the GZ DESI catalog (below) via its ``dr8_id`` key.
 It is formed with ``{brickid}_{objid}`` where brickid is the unique identifier for the brick in the Legacy Surveys and objid is the unique identifier for the object in the brick.
 ``RA`` and ``Dec`` are in degrees.
-The PCA features are the first N principal components representation (which is otherwse impractically large to work with).
+The PCA features are the first 40 principal components of the full representation (which is otherwise impractically large to work with).
+
 
 Galaxy Zoo Morphology
 =======================
 
 Zoobot was used to create a detailed morphology catalog for every (extended, brighter than r=19) galaxy in the DESI Legacy Surveys (8.7M galaxies).
+The catalog and schema are available from `Zenodo `_.
+For new users, we suggest starting with the ``gz_desi_deep_learning_catalog_friendly.parquet`` catalog file.
+
+We previously used Zoobot to create a similar catalog for `DECaLS DR5 `_.
+This has now been superseded by the GZ DESI catalog above (which includes the same galaxies, and many more).
+
+We aim to provide both representations and an updated morphology catalog for DESI-LS DR10, but we need to redownload all the images first |:neutral_face:|.
 
-We aim to provide both representations and an updated morphology catalog for DESI-LS DR10.
+Future catalogs will include morphology measurements for HSC, JWST, and Euclid galaxies (likely in that order).
From e1b43c6d70d722d8edc933a410030a13d5509369 Mon Sep 17 00:00:00 2001
From: Mike Walmsley
Date: Thu, 4 Apr 2024 15:03:02 -0400
Subject: [PATCH 307/307] tiny tweaks

---
 zoobot/pytorch/training/finetune.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/zoobot/pytorch/training/finetune.py b/zoobot/pytorch/training/finetune.py
index 037fc553..67ef1997 100644
--- a/zoobot/pytorch/training/finetune.py
+++ b/zoobot/pytorch/training/finetune.py
@@ -68,6 +68,7 @@ class FinetuneableZoobotAbstract(pl.LightningModule):
         prog_bar (bool, optional): Print progress bar during finetuning. Defaults to True.
         visualize_images (bool, optional): Upload example images to WandB. Good for debugging but slow. Defaults to False.
         seed (int, optional): random seed to use. Defaults to 42.
+        n_layers: No effect, deprecated. Use n_blocks instead. 
""" def __init__( @@ -104,7 +105,8 @@ def __init__( # debugging utils prog_bar=True, visualize_images=False, # upload examples to wandb, good for debugging - seed=42 + seed=42, + n_layers=None, # deprecated, no effect ): super().__init__() @@ -380,7 +382,7 @@ def upload_images_to_wandb(self, outputs, batch, batch_idx): @classmethod def load_from_name(cls, name: str, **kwargs): - downloaded_loc = download_from_name(cls.__name__, name, **kwargs) + downloaded_loc = download_from_name(cls.__name__, name) return cls.load_from_checkpoint(downloaded_loc, **kwargs) # trained on GPU, may need map_location='cpu' if you get a device error