From af0194c3803c93c06d84c2a70df6011c3e09a927 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Thu, 29 Aug 2024 17:22:58 -0400
Subject: [PATCH] Update learning rate for #116 and fix another small error

---
 machine/jobs/nmt_engine_build_job.py | 2 +-
 machine/jobs/settings.yaml           | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/machine/jobs/nmt_engine_build_job.py b/machine/jobs/nmt_engine_build_job.py
index 63b4b81..c3caf72 100644
--- a/machine/jobs/nmt_engine_build_job.py
+++ b/machine/jobs/nmt_engine_build_job.py
@@ -74,7 +74,7 @@ def _train_model(
         ) as model_trainer:
             model_trainer.train(progress=phase_progress, check_canceled=check_canceled)
             model_trainer.save()
-            train_corpus_size = model_trainer.stats.train_corpus_size
+            train_corpus_size = parallel_corpus.count()
         return train_corpus_size, float("nan")
 
     def _batch_inference(
diff --git a/machine/jobs/settings.yaml b/machine/jobs/settings.yaml
index bcbe7c7..00d9517 100644
--- a/machine/jobs/settings.yaml
+++ b/machine/jobs/settings.yaml
@@ -8,15 +8,17 @@ default:
     train_params:
       do_train: true
       optim: adamw_torch
-      warmup_steps: 4000
+      warmup_steps: 1000
       per_device_train_batch_size: 16
       gradient_accumulation_steps: 4
       label_smoothing_factor: 0.2
       group_by_length: true
       gradient_checkpointing: true
+      lr_scheduler_type: cosine
+      learning_rate: 0.0002
       fp16: true
       save_strategy: no
-      max_steps: 20000
+      max_steps: 5000
     generate_params:
       device: 0
       num_beams: 2
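
For reviewers: the nmt_engine_build_job.py hunk stops reading the corpus size
from the trainer's stats and counts the parallel corpus directly, apparently
the "small error" named in the subject line. A minimal sketch of the corrected
flow, assuming the context-manager trainer API visible in the hunk; here
`create_trainer` is a hypothetical stand-in for the model-trainer factory call
the job actually makes:

    def train_and_count(create_trainer, parallel_corpus, phase_progress, check_canceled):
        with create_trainer(parallel_corpus) as model_trainer:
            model_trainer.train(progress=phase_progress, check_canceled=check_canceled)
            model_trainer.save()
            # Count the parallel corpus directly instead of reading
            # model_trainer.stats.train_corpus_size, as in the patched line.
            train_corpus_size = parallel_corpus.count()
        return train_corpus_size, float("nan")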
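
The settings.yaml hunk cuts warmup from 4000 to 1000 steps, shortens training
from 20000 to 5000 steps, and pins an explicit peak learning rate of 2e-4 with
cosine decay instead of the Hugging Face defaults (5e-5, linear). A sketch of
how the updated train_params would map onto transformers'
Seq2SeqTrainingArguments, assuming machine.py forwards them to the trainer as
keyword arguments; output_dir is a placeholder, not part of the patch:

    from transformers import Seq2SeqTrainingArguments

    args = Seq2SeqTrainingArguments(
        output_dir="out",                # placeholder
        do_train=True,
        optim="adamw_torch",
        warmup_steps=1000,               # was 4000
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        label_smoothing_factor=0.2,
        group_by_length=True,
        gradient_checkpointing=True,
        lr_scheduler_type="cosine",      # new: cosine decay after warmup
        learning_rate=2e-4,              # new: explicit peak LR (HF default is 5e-5)
        fp16=True,
        save_strategy="no",
        max_steps=5000,                  # was 20000
    )

With these values the effective batch size stays at 16 x 4 = 64 examples per
optimizer step per device, so the patch shortens the schedule without changing
per-step throughput.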