From cadccaf1a00f4fc6cd37917593c3975e6cee6f1b Mon Sep 17 00:00:00 2001
From: Benjamin Warner
Date: Tue, 12 Mar 2024 00:26:48 -0500
Subject: [PATCH] Add LR scheduler to examples

---
 README.md                      | 13 +++++++++++++
 docs/gradient_release.md       |  6 ++++++
 docs/index.md                  | 13 +++++++++++++
 docs/optimizer_accumulation.md |  7 +++++++
 4 files changed, 39 insertions(+)

diff --git a/README.md b/README.md
index be0ef27..21e7d81 100644
--- a/README.md
+++ b/README.md
@@ -85,6 +85,9 @@ To use with gradient release:
 opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
 prepare_for_gradient_release(model, opt)
 
+# setup a learning rate scheduler like normal
+scheduler = CosineAnnealingLR(opt, ...)
+
 # calling backward on the model will peform the optimzier step
 loss = model(torch.randn(20, dtype=torch.bfloat16))
 loss.backward()
@@ -94,6 +97,9 @@ loss.backward()
 # opt.step()
 # opt.zero_grad()
 
+# step the learning rate scheduler like normal
+scheduler.step()
+
 # optionally remove gradient release hooks when done training
 remove_gradient_release(model)
 ```
@@ -110,6 +116,9 @@ prepare_for_gradient_release(model, opt)
 # gradients directly into the optimizer states
 accumulation_steps = 4
 
+# setup a learning rate scheduler for gradient accumulation
+scheduler = CosineAnnealingLR(opt, ...)
+
 # use existing PyTorch dataloader
 for idx, batch in enumerate(dataloader):
     # `optimizer_accumulation=True` accumulates gradients into
@@ -127,6 +136,10 @@ for idx, batch in enumerate(dataloader):
     # opt.step()
     # opt.zero_grad()
 
+    # step the learning rate scheduler after accumulating gradients
+    if not opt.optimizer_accumulation:
+        scheduler.step()
+
 # optionally remove gradient release hooks when done training
 remove_gradient_release(model)
 ```
diff --git a/docs/gradient_release.md b/docs/gradient_release.md
index b02381c..7ad87e2 100644
--- a/docs/gradient_release.md
+++ b/docs/gradient_release.md
@@ -54,6 +54,9 @@ model = nn.Linear(20, 1, dtype=torch.bfloat16)
 opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
 prepare_for_gradient_release(model, opt)
 
+# setup a learning rate scheduler like normal
+scheduler = CosineAnnealingLR(opt, ...)
+
 # calling backward on the model will peform the optimzier step
 loss = model(torch.randn(20, dtype=torch.bfloat16))
 loss.backward()
@@ -63,6 +66,9 @@ loss.backward()
 # opt.step()
 # opt.zero_grad()
 
+# step the learning rate scheduler like normal
+scheduler.step()
+
 # optionally remove gradient release hooks when done training
 remove_gradient_release(model)
 ```
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index 29541c8..0dc61f7 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -86,6 +86,9 @@ To use with gradient release:
 opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
 prepare_for_gradient_release(model, opt)
 
+# setup a learning rate scheduler like normal
+scheduler = CosineAnnealingLR(opt, ...)
+
 # calling backward on the model will peform the optimzier step
 loss = model(torch.randn(20, dtype=torch.bfloat16))
 loss.backward()
@@ -95,6 +98,9 @@ loss.backward()
 # opt.step()
 # opt.zero_grad()
 
+# step the learning rate scheduler like normal
+scheduler.step()
+
 # optionally remove gradient release hooks when done training
 remove_gradient_release(model)
 ```
@@ -111,6 +117,9 @@ prepare_for_gradient_release(model, opt)
 # gradients directly into the optimizer states
 accumulation_steps = 4
 
+# setup a learning rate scheduler for gradient accumulation
+scheduler = CosineAnnealingLR(opt, ...)
+
 # use existing PyTorch dataloader
 for idx, batch in enumerate(dataloader):
     # `optimizer_accumulation=True` accumulates gradients into
@@ -128,6 +137,10 @@ for idx, batch in enumerate(dataloader):
     # opt.step()
     # opt.zero_grad()
 
+    # step the learning rate scheduler after accumulating gradients
+    if not opt.optimizer_accumulation:
+        scheduler.step()
+
 # optionally remove gradient release hooks when done training
 remove_gradient_release(model)
 ```
diff --git a/docs/optimizer_accumulation.md b/docs/optimizer_accumulation.md
index 815cc14..0e677b6 100644
--- a/docs/optimizer_accumulation.md
+++ b/docs/optimizer_accumulation.md
@@ -64,6 +64,9 @@ prepare_for_gradient_release(model, opt)
 # gradients directly into the optimizer states
 accumulation_steps = 4
 
+# setup a learning rate scheduler for gradient accumulation
+scheduler = CosineAnnealingLR(opt, ...)
+
 # use existing PyTorch dataloader
 for idx, batch in enumerate(dataloader):
     # `optimizer_accumulation=True` accumulates gradients into
@@ -81,6 +84,10 @@ for idx, batch in enumerate(dataloader):
     # opt.step()
     # opt.zero_grad()
 
+    # step the learning rate scheduler after accumulating gradients
+    if not opt.optimizer_accumulation:
+        scheduler.step()
+
 # optionally remove gradient release hooks when done training
 remove_gradient_release(model)
 ```
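Taken together, the snippets added by this patch fit into a loop like the minimal sketch below. It assumes the optimi API shown in the diff (`AdamW(..., gradient_release=True)`, `prepare_for_gradient_release`, `opt.optimizer_accumulation`, `remove_gradient_release`) is importable from `optimi`; the model, stand-in dataloader, `T_max`, and `accumulation_steps` values are illustrative, not part of the patch.

```python
# Minimal sketch: gradient release + optimizer accumulation + LR scheduler,
# assuming the optimi API shown in the patch above.
import torch
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR
from optimi import AdamW, prepare_for_gradient_release, remove_gradient_release

model = nn.Linear(20, 1, dtype=torch.bfloat16)

# gradient release: the optimizer step runs during the backward pass
opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
prepare_for_gradient_release(model, opt)

accumulation_steps = 4
total_batches = 100

# set up a learning rate scheduler like normal; it only steps once per
# effective optimizer step, hence T_max in optimizer steps (illustrative)
scheduler = CosineAnnealingLR(opt, T_max=total_batches // accumulation_steps)

# stand-in data; replace with an existing PyTorch DataLoader
dataloader = [torch.randn(20, dtype=torch.bfloat16) for _ in range(total_batches)]

for idx, batch in enumerate(dataloader):
    # True on intermediate micro-batches: backward accumulates gradients
    # into the optimizer states; False on the last micro-batch: backward
    # performs the optimizer step
    opt.optimizer_accumulation = (idx + 1) % accumulation_steps != 0

    loss = model(batch)
    loss.backward()

    # opt.step() and opt.zero_grad() are not needed and would no-op

    # step the scheduler only on iterations where an optimizer step occurred
    if not opt.optimizer_accumulation:
        scheduler.step()

# optionally remove the gradient release hooks when done training
remove_gradient_release(model)
```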