diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 59f4c795..e75d0c7c 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -30,8 +30,8 @@ class DeepSpeedOffloadStrategy(Enum):
 
 # public API
 class DistributedBackend(Enum):
-    FSDP: str = "fsdp"
-    DEEPSPEED: str = "deepspeed"
+    FSDP = "fsdp"
+    DEEPSPEED = "deepspeed"
 
 
 # public API
@@ -121,6 +121,7 @@ class DeepSpeedOptions(BaseModel):
     save_samples: int | None = None
 
 
+# public API
 class DistillationConfig(BaseModel):
     """
     Config to use when performing knowledge distillation during training.
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 48f2a156..189fdbc5 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -1086,7 +1086,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     parser.add_argument(
         "--weight_decay",
         type=float,
-        default=1e-3,
+        default=0,
         help="Weight decay rate for optimizers that support it.",
     )
     parser.add_argument(