From ee50d4aaeabbec3a82628d0804b0e078b04b84d4 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 20 Dec 2022 17:56:10 +0000 Subject: [PATCH] Merged PR 27051: Add an option for completely resetting validation metrics Added `--valid-reset-all` that works as `--valid-reset-stalled` but it also resets last best saved validation metrics, which is useful for when the validation sets change for continued training. Added new regression test: https://github.com/marian-nmt/marian-regression-tests/pull/89 --- CHANGELOG.md | 1 + VERSION | 2 +- azure-pipelines.yml | 5 ++++- src/common/config_parser.cpp | 6 ++++-- src/training/scheduler.h | 13 +++++++++---- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c46df0f25..53f81397d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fused inplace-dropout in FFN layer in Transformer - `--force-decode` option for marian-decoder - `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`) +- `--valid-reset-all` option ### Fixed - Make concat factors not break old vector implementation diff --git a/VERSION b/VERSION index daf48f91d..2eac760f5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.14 +v1.11.15 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index faa619006..3b1bfff3f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -595,7 +595,10 @@ stages: # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev - - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-9 g++-9 + # Installing libunwind-dev fixes a bug in 2204 (the libunwind-14 and libunwind-dev conflict) + - bash: | + sudo apt-get install -y libunwind-dev + sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-9 g++-9 displayName: Install packages # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index c9ab45f81..4cc23f2ca 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -269,7 +269,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", "Size of position-wise feed-forward network (transformer)", - 2048); + 2048); cli.add("--transformer-decoder-dim-ffn", "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.", 0); @@ -591,7 +591,9 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { "Multiple metrics can be specified", {"cross-entropy"}); cli.add("--valid-reset-stalled", - "Reset all stalled validation metrics when the training is restarted"); + "Reset stalled validation metrics when the training is restarted"); + cli.add("--valid-reset-all", + "Reset all validation metrics when the training is restarted"); cli.add("--early-stopping", "Stop if the first validation metric does not improve for arg consecutive validation steps", 10); diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 34aa18c21..30f8c8de7 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -494,12 +494,17 @@ class Scheduler : public TrainingObserver { state_->wordsDisp = 0; } - if(options_->get("valid-reset-stalled")) { + if(options_->get("valid-reset-stalled") || options_->get("valid-reset-all")) { state_->stalled = 0; state_->maxStalled = 0; for(const auto& validator : validators_) { - if(state_->validators[validator->type()]) + if(state_->validators[validator->type()]) { + // reset the number of stalled validations, e.g. when the validation set is the same state_->validators[validator->type()]["stalled"] = 0; + // reset last best results as well, e.g. when the validation set changes + if(options_->get("valid-reset-all")) + state_->validators[validator->type()]["last-best"] = validator->initScore(); + } } } @@ -512,10 +517,10 @@ class Scheduler : public TrainingObserver { if(mpi_->isMainProcess()) if(filesystem::exists(nameYaml)) yamlStr = io::InputFileStream(nameYaml).readToString(); - + if(mpi_) mpi_->bCast(yamlStr); - + loadFromString(yamlStr); }