Commit: Add LR scheduler to examples

warner-benjamin committed Mar 12, 2024
1 parent 41452f5 commit cadccaf

Showing 4 changed files with 39 additions and 0 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -85,6 +85,9 @@ To use with gradient release:
opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
prepare_for_gradient_release(model, opt)

# set up a learning rate scheduler like normal
scheduler = CosineAnnealingLR(opt, ...)

# calling backward on the model will perform the optimizer step
loss = model(torch.randn(20, dtype=torch.bfloat16))
loss.backward()
@@ -94,6 +97,9 @@ loss.backward()
# opt.step()
# opt.zero_grad()

# step the learning rate scheduler like normal
scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```
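The hunks above show the new scheduler lines in isolation. Below is a minimal end-to-end sketch of gradient release with a scheduler, assuming `AdamW`, `prepare_for_gradient_release`, and `remove_gradient_release` are imported from optimi as in the surrounding docs, and using PyTorch's `CosineAnnealingLR`; the toy model, random input, loop length, and `T_max` value are placeholders for illustration, not part of the committed diff.

```
# Minimal sketch of gradient release with an LR scheduler (not part of the
# committed diff). Imports are assumed to match optimi's documented API;
# the toy model, random input, and T_max value are placeholders.
import torch
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR
from optimi import AdamW, prepare_for_gradient_release, remove_gradient_release

model = nn.Linear(20, 1, dtype=torch.bfloat16)
opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
prepare_for_gradient_release(model, opt)

# set up a learning rate scheduler like normal (T_max is a placeholder)
scheduler = CosineAnnealingLR(opt, T_max=100)

for step in range(100):
    # backward performs the optimizer step layer by layer,
    # so there is no separate opt.step() or opt.zero_grad()
    loss = model(torch.randn(20, dtype=torch.bfloat16))
    loss.backward()

    # step the learning rate scheduler like normal
    scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```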
@@ -110,6 +116,9 @@ prepare_for_gradient_release(model, opt)
# gradients directly into the optimizer states
accumulation_steps = 4

# set up a learning rate scheduler for gradient accumulation
scheduler = CosineAnnealingLR(opt, ...)

# use existing PyTorch dataloader
for idx, batch in enumerate(dataloader):
# `optimizer_accumulation=True` accumulates gradients into
@@ -127,6 +136,10 @@ for idx, batch in enumerate(dataloader):
# opt.step()
# opt.zero_grad()

# step the learning rate scheduler after accumulating gradients
if not opt.optimizer_accumulation:
scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```
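Likewise, the optimizer accumulation hunks only show the added scheduler lines; the sketch below fills in a plausible surrounding loop. The optimi imports and the `(idx + 1) % accumulation_steps != 0` toggle are assumptions based on the snippets above rather than the exact README text, and the model, dataloader, loss, and `T_max` are placeholders.

```
# Minimal sketch of optimizer accumulation with an LR scheduler (not part of
# the committed diff). The accumulation toggle and imports are assumptions;
# the model, dataloader, loss, and T_max are placeholders.
import torch
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, TensorDataset
from optimi import AdamW, prepare_for_gradient_release, remove_gradient_release

model = nn.Linear(20, 1)
opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
prepare_for_gradient_release(model, opt)

# optimizer accumulation approximates gradient accumulation by accumulating
# gradients directly into the optimizer states
accumulation_steps = 4

# set up a learning rate scheduler for gradient accumulation
scheduler = CosineAnnealingLR(opt, T_max=25)

# placeholder dataloader of (input, target) batches
dataset = TensorDataset(torch.randn(100, 20), torch.randn(100, 1))
dataloader = DataLoader(dataset, batch_size=10)

for idx, (inputs, targets) in enumerate(dataloader):
    # accumulate into the optimizer states except on the final microbatch,
    # where backward also performs the optimizer step
    opt.optimizer_accumulation = (idx + 1) % accumulation_steps != 0

    loss = nn.functional.mse_loss(model(inputs), targets)
    loss.backward()

    # step the learning rate scheduler only after a real optimizer step
    if not opt.optimizer_accumulation:
        scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```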
6 changes: 6 additions & 0 deletions docs/gradient_release.md
@@ -54,6 +54,9 @@ model = nn.Linear(20, 1, dtype=torch.bfloat16)
opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
prepare_for_gradient_release(model, opt)

# set up a learning rate scheduler like normal
scheduler = CosineAnnealingLR(opt, ...)

# calling backward on the model will perform the optimizer step
loss = model(torch.randn(20, dtype=torch.bfloat16))
loss.backward()
@@ -63,6 +66,9 @@ loss.backward()
# opt.step()
# opt.zero_grad()

# step the learning rate scheduler like normal
scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```
13 changes: 13 additions & 0 deletions docs/index.md
@@ -86,6 +86,9 @@ To use with gradient release:
opt = AdamW(model.parameters(), lr=1e-3, gradient_release=True)
prepare_for_gradient_release(model, opt)

# set up a learning rate scheduler like normal
scheduler = CosineAnnealingLR(opt, ...)

# calling backward on the model will perform the optimizer step
loss = model(torch.randn(20, dtype=torch.bfloat16))
loss.backward()
@@ -95,6 +98,9 @@ loss.backward()
# opt.step()
# opt.zero_grad()

# step the learning rate scheduler like normal
scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```
@@ -111,6 +117,9 @@ prepare_for_gradient_release(model, opt)
# gradients directly into the optimizer states
accumulation_steps = 4

# set up a learning rate scheduler for gradient accumulation
scheduler = CosineAnnealingLR(opt, ...)

# use existing PyTorch dataloader
for idx, batch in enumerate(dataloader):
# `optimizer_accumulation=True` accumulates gradients into
@@ -128,6 +137,10 @@ for idx, batch in enumerate(dataloader):
# opt.step()
# opt.zero_grad()

# step the learning rate scheduler after accumulating gradients
if not opt.optimizer_accumulation:
scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```
7 changes: 7 additions & 0 deletions docs/optimizer_accumulation.md
@@ -64,6 +64,9 @@ prepare_for_gradient_release(model, opt)
# gradients directly into the optimizer states
accumulation_steps = 4

# set up a learning rate scheduler for gradient accumulation
scheduler = CosineAnnealingLR(opt, ...)

# use existing PyTorch dataloader
for idx, batch in enumerate(dataloader):
# `optimizer_accumulation=True` accumulates gradients into
@@ -81,6 +84,10 @@ for idx, batch in enumerate(dataloader):
# opt.step()
# opt.zero_grad()

# step the learning rate scheduler after accumulating gradients
if not opt.optimizer_accumulation:
scheduler.step()

# optionally remove gradient release hooks when done training
remove_gradient_release(model)
```
