
Commit 8da539e
merged with master
emjotde committed Feb 6, 2022
1 parent 266b931 commit 8da539e
Showing 37 changed files with 849 additions and 340 deletions.
40 changes: 20 additions & 20 deletions azure-pipelines.yml
@@ -6,6 +6,13 @@
# 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file
# 4. "More actions" > "Save"

parameters:
# Allow skipping the entire 'Build' stage
- name: runBuilds
displayName: Run builds? Uncheck to run regression tests only.
type: boolean
default: true

# The pipeline CI trigger is set on the branch master only and PR trigger on a
# (non-draft) pull request to any branch
trigger:
@@ -45,6 +52,7 @@ stages:

######################################################################
- job: BuildWindows
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Windows

strategy:
@@ -180,6 +188,7 @@ stages:
######################################################################
- job: BuildUbuntu
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Ubuntu
timeoutInMinutes: 90

@@ -237,17 +246,7 @@ stages:
examples: true
static: true
################################################################
# Ubuntu 16.04 supports CUDA 8+
"16.04 CUDA 9.2 gcc-7":
image: ubuntu-16.04
boost: true
cpu: true
gpu: true
cuda: 9.2
gcc: 7
unit_tests: true
examples: true
static: false
# Ubuntu 16.04 is no longer available on Azure-hosted machines

pool:
vmImage: $(image)
@@ -322,18 +321,17 @@ stages:
######################################################################
- job: BuildUbuntuMinimal
displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5

pool:
vmImage: ubuntu-16.04
vmImage: ubuntu-18.04

steps:
- checkout: self
submodules: true

# The script simplifies installation of different versions of CUDA.
# Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9.
# Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work...
- bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0"
displayName: Install CUDA

@@ -346,10 +344,10 @@
# GCC 5 is the minimum version supported
- bash: |
/usr/bin/gcc-5 --version
/usr/bin/gcc-7 --version
mkdir -p build
cd build
CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \
CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \
../cmake-3.5.1-Linux-x86_64/bin/cmake .. \
-DCOMPILE_CPU=on \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0
@@ -368,10 +366,11 @@
######################################################################
- job: BuildMacOS
condition: eq(${{ parameters.runBuilds }}, true)
displayName: macOS CPU clang

pool:
vmImage: macos-latest
vmImage: macos-10.15

steps:
- checkout: self
@@ -416,6 +415,7 @@ stages:
######################################################################
- job: BuildInstall
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Linux CPU library install

pool:
@@ -580,7 +580,7 @@ stages:
# Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\'
# instead of '/', which often breaks the job
- bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
- bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops'
continueOnError: true
displayName: Run tests
workingDirectory: marian-prod-tests
@@ -677,7 +677,7 @@ stages:
AWS_SECRET_SAS_TOKEN: $(blob-sas-token)
workingDirectory: marian-prod-tests
- bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
- bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops'
continueOnError: true
displayName: Run tests
workingDirectory: marian-prod-tests
4 changes: 2 additions & 2 deletions src/common/aliases.cpp
@@ -31,8 +31,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
cli.alias("fp16", "true", [&](YAML::Node& config) {
if(mode_ == cli::mode::training) {
config["precision"] = std::vector<std::string>({"float16", "float32"}); // inference type, optimization type, save type
// scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor
config["cost-scaling"] = std::vector<std::string>({"0", "1000", "2", "0.05", "10", "1e-5"});
// scaling factor, frequency, multiplier at increase, minimum scaling factor
config["cost-scaling"] = std::vector<std::string>({"256.f", "1000", "2.f", "256.f"});
} else {
config["precision"] = std::vector<std::string>({"float16"}); // for inference we do not need the other types
}
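Editor's note: the new defaults (256.f, 1000, 2.f, 256.f) replace the old power-of-2 search with a fixed starting scale. As a rough illustration of how a dynamic cost scaler with these four parameters behaves, here is a minimal self-contained sketch; the struct and its names are hypothetical, not Marian's actual optimizer code:

```cpp
#include <algorithm>

// Hypothetical sketch of dynamic cost scaling with the new defaults:
// start at 256, double after 1000 overflow-free updates, halve on overflow,
// and never drop below the minimum factor of 256.
struct CostScaler {
  float scale      = 256.f; // initial scaling factor
  float multiplier = 2.f;   // factor applied at increase/decrease
  float minScale   = 256.f; // scale never drops below this
  int   frequency  = 1000;  // overflow-free updates before increasing
  int   goodSteps  = 0;

  void update(bool overflowed) {
    if(overflowed) { // float16 gradients overflowed: back off
      scale = std::max(scale / multiplier, minScale);
      goodSteps = 0;
    } else if(++goodSteps >= frequency) { // stable: try a larger scale
      scale *= multiplier;
      goodSteps = 0;
    }
  }
};
```

The loss is multiplied by `scale` before the backward pass and gradients are divided by it afterwards, which keeps small float16 gradients from flushing to zero.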
28 changes: 21 additions & 7 deletions src/common/config_parser.cpp
@@ -267,10 +267,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
"Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
cli.add<int>("--transformer-dim-ffn",
"Size of position-wise feed-forward network (transformer)",
2048);
2048);
cli.add<int>("--transformer-decoder-dim-ffn",
"Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.",
0);
cli.add<int>("--transformer-ffn-depth",
"Depth of filters (transformer)",
2);
cli.add<int>("--transformer-decoder-ffn-depth",
"Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0",
0);
cli.add<std::string>("--transformer-ffn-activation",
"Activation between filters: swish or relu (transformer)",
"swish");
@@ -528,15 +534,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
// mixed precision training
cli.add<bool>("--fp16",
"Shortcut for mixed precision training with float16 and cost-scaling, "
"corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f");
"corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision training for forward/backward pass and optimizaton. "
"Defines types for: forward/backward pass, optimization.",
{"float32", "float32"});
cli.add<std::vector<std::string>>("--cost-scaling",
"Dynamic cost scaling for mixed precision training: "
"power of 2, scaling window, scaling factor, tolerance, range, minimum factor")
->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f");
"scaling factor, frequency, multiplier, minimum factor")
->implicit_val("256.f 1000 2.f 256.f");
cli.add<size_t>("--gradient-norm-average-window",
"Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
"After this many updates about 90% of the mass of the exponential average comes from these updates",
@@ -702,9 +708,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Use softmax shortlist: path first best prune");
cli.add<std::vector<float>>("--weights",
"Scorer weights");
cli.add<bool>("--output-sampling",
"Noise output layer with gumbel noise",
false);
cli.add<std::vector<std::string>>("--output-sampling",
"Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. "
" Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.")
->implicit_val("full");
cli.add<std::vector<int>>("--output-approx-knn",
"Use approximate knn search in output layer (currently only in transformer)")
->implicit_val("100 1024");
@@ -889,6 +896,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
if(mode_ == cli::mode::training) {
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");

cli.add<size_t>("--data-threads",
"Number of concurrent threads to use during data reading and processing", 1);

// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
cli.add<size_t>("--all-caps-every",
"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
@@ -907,6 +918,9 @@
cli.add<bool>("--mini-batch-round-up",
"Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
true);
} else {
cli.add<size_t>("--data-threads",
"Number of concurrent threads to use during data reading and processing", 1);
}
// clang-format on
}
10 changes: 5 additions & 5 deletions src/common/definitions.h
@@ -106,24 +106,24 @@ using Weak = std::weak_ptr<T>;
/** @brief Creates shared_ptr of any type, passes all arguments to any available
* constructor */
template <class T, typename... Args>
Ptr<T> New(Args&&... args) {
return Ptr<T>(new T(std::forward<Args>(args)...));
inline Ptr<T> New(Args&&... args) {
return std::make_shared<T>(std::forward<Args>(args)...);
}

template <class T>
Ptr<T> New(Ptr<T> p) {
inline Ptr<T> New(Ptr<T> p) {
return Ptr<T>(p);
}

/** @brief Creates IntrusivePtr of any type, passes all arguments to any available
* constructor */
template <class T, typename... Args>
IPtr<T> INew(Args&&... args) {
inline IPtr<T> INew(Args&&... args) {
return IPtr<T>(new T(std::forward<Args>(args)...));
}

template <class T>
IPtr<T> INew(Ptr<T> p) {
inline IPtr<T> INew(Ptr<T> p) {
return IPtr<T>(p);
}

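Editor's note: the change to New<T> swaps Ptr<T>(new T(...)) for std::make_shared, which allocates the object and the shared_ptr control block together. A small standalone example of the difference (general C++, not Marian-specific code):

```cpp
#include <memory>

// shared_ptr<T>(new T) performs two allocations: one for the object and
// one for the reference-count control block. make_shared fuses them into
// a single allocation with better cache locality.
struct Foo {
  int x;
  explicit Foo(int x) : x(x) {}
};

int main() {
  auto a = std::shared_ptr<Foo>(new Foo(1)); // two allocations
  auto b = std::make_shared<Foo>(2);         // one allocation
  return a->x + b->x;
}
```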
8 changes: 3 additions & 5 deletions src/common/utils.cpp
@@ -70,22 +70,20 @@ void split(const std::string& line,
// the function guarantees that the output has as many elements as requested
void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields) {
fields.clear();
fields.resize(numFields); // make sure there are as many elements as requested

size_t begin = 0;
size_t pos = 0;
for(size_t i = 0; i < numFields; ++i) {
pos = line.find('\t', begin);
if(pos == std::string::npos) {
fields.push_back(line.substr(begin));
fields[i] = line.substr(begin);
break;
}
fields.push_back(line.substr(begin, pos - begin));
fields[i] = line.substr(begin, pos - begin);
begin = pos + 1;
}

if(fields.size() < numFields) // make sure there is as many elements as requested
fields.resize(numFields);

ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line);
}

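Editor's note: the rewritten splitTsv resizes the output vector up front and assigns by index, so callers always receive exactly numFields entries, with missing trailing fields left empty. A usage sketch, assuming splitTsv is declared in common/utils.h under namespace marian::utils:

```cpp
#include <string>
#include <vector>
#include "common/utils.h" // assumed location of splitTsv's declaration

void example() {
  std::vector<std::string> fields;
  // Ask for 3 fields from a line that only contains 2:
  marian::utils::splitTsv("source text\ttarget text", fields, 3);
  // fields.size() == 3; fields[2] == "" instead of being absent,
  // which is what the up-front resize in the new version guarantees.
}
```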
35 changes: 23 additions & 12 deletions src/data/batch_generator.h
@@ -2,6 +2,7 @@

#include "common/options.h"
#include "common/signal_handling.h"
#include "common/timer.h"
#include "data/batch_stats.h"
#include "data/rng_engine.h"
#include "training/training_state.h"
@@ -92,6 +93,8 @@ class BatchGenerator : public RNGEngine {

// this runs on a bg thread; sequencing is handled by caller, but locking is done in here
std::deque<BatchPtr> fetchBatches() {
timer::Timer total;

typedef typename Sample::value_type Item;
auto itemCmp = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; // sort by element length, not content

@@ -135,19 +138,29 @@
if(current_ != data_->end())
++current_;
}
size_t sets = 0;
while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data

Samples maxiBatchTemp;
while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
maxiBatch->push(*current_);
sets = current_->size();

maxiBatchTemp.push_back(*current_);

// do not consume more than required for the maxi batch, as otherwise
// line-by-line translation would be delayed by one sentence
bool last = maxiBatch->size() == maxSize;
bool last = maxiBatchTemp.size() == maxSize;
if(!last)
++current_; // this actually reads the next line and pre-processes it
}
size_t numSentencesRead = maxiBatch->size();
size_t numSentencesRead = maxiBatchTemp.size();

size_t sets = 0;
for(auto&& s : maxiBatchTemp) {
if(!s.empty()) {
sets = s.size();
maxiBatch->push(s);
}
}

// construct the actual batches and place them in the queue
Samples batchVector;
@@ -163,6 +176,7 @@
BatchStats::const_iterator cachedStatsIter;
if (stats_)
cachedStatsIter = stats_->begin();

while(!maxiBatch->empty()) { // while there are sentences in the queue
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
@@ -178,12 +192,7 @@
lengths[i] = batchVector.back()[i].size(); // record max lengths so far

maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter);
// this optimization makes no difference indeed
#if 0 // sanity check: would we find the same entry if searching from the start?
auto it = stats_->lower_bound(lengths);
auto maxBatchSize1 = stats_->findBatchSize(lengths, it);
ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked");
#endif

makeBatch = batchVector.size() >= maxBatchSize;
// if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch
if(batchVector.size() > maxBatchSize) {
@@ -231,6 +240,8 @@
LOG(debug, "[data] fetched {} batches with {} sentences. Per batch: {} sentences, {} labels.",
tempBatches.size(), numSentencesRead,
(double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom);
LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed());

return tempBatches;
}

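Editor's note: the new timing code wraps the whole fetch in a timer::Timer and logs seconds and sentences per second. A minimal stand-in showing the pattern (Marian's own class lives in common/timer.h; this sketch only mirrors how it is used here):

```cpp
#include <chrono>
#include <cstdio>

// Simplified stand-in for timer::Timer: construction starts the clock,
// elapsed() returns seconds since construction.
struct Timer {
  std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
  double elapsed() const {
    return std::chrono::duration<double>(
        std::chrono::steady_clock::now() - start).count();
  }
};

int main() {
  Timer total;
  // ... read, sort, and assemble batches ...
  std::printf("fetching batches took %.2f seconds\n", total.elapsed());
}
```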