Commit

merge with internal master
emjotde committed Feb 11, 2022
2 parents 8fd553e + 4b51dcb commit b0275e7
Showing 23 changed files with 418 additions and 220 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load

### Changed
- Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce.
- Changed minimum C++ standard to C++17
- Faster LSH top-k search on CPU

## [1.11.0] - 2022-02-08

7 changes: 4 additions & 3 deletions CMakeLists.txt
@@ -6,7 +6,7 @@ if (POLICY CMP0074)
endif ()

project(marian CXX C)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")

@@ -91,10 +91,11 @@ if(MSVC)
# C4310: cast truncates constant value
# C4324: 'marian::cpu::int16::`anonymous-namespace'::ScatterPut': structure was padded due to alignment specifier
# C4702: unreachable code; note it is also disabled globally in the VS project file
# C4996: warning STL4015: The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17
if(USE_SENTENCEPIECE)
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4100\"")
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4996\" /wd\"4100\"")
else()
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\"")
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4996\"")
endif()

# set(INTRINSICS "/arch:AVX")
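
The C4996 suppressions added above correspond to MSVC's STL4015 notice quoted in the comment: under C++17, deriving an iterator type from std::iterator is deprecated. A minimal illustration of the deprecated pattern and its usual replacement (spelling out the five member typedefs) — not taken from Marian or its submodules, just a sketch of what the warning is about:

#include <cstddef>
#include <iterator>

// Pre-C++17 style (now deprecated; triggers STL4015 / C4996 on MSVC):
// struct LegacyIter : std::iterator<std::forward_iterator_tag, int> { /* ... */ };

// C++17-friendly replacement: declare the member typedefs explicitly.
struct ModernIter {
  using iterator_category = std::forward_iterator_tag;
  using value_type        = int;
  using difference_type   = std::ptrdiff_t;
  using pointer           = int*;
  using reference         = int&;
  // ... increment, dereference and comparison operators as before ...
};

int main() { return 0; }  // nothing to run; the point is that this compiles cleanly under /std:c++17
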
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
v1.11.1
v1.11.3
48 changes: 3 additions & 45 deletions azure-pipelines.yml
@@ -192,6 +192,9 @@ stages:
displayName: Ubuntu
timeoutInMinutes: 90

# Minimal tested configurations for marian-dev v1.11 and C++17:
# * Ubuntu 16.04, GCC 7.5, CMake 3.10.2, CUDA 9.2 (probably GCC 6 would work too)
# * Ubuntu 18.04, GCC 7.5, CMake 3.12.2, CUDA 10.0
strategy:
matrix:
################################################################
@@ -319,51 +322,6 @@
displayName: Print versions
workingDirectory: build
######################################################################
- job: BuildUbuntuMinimal
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5

pool:
vmImage: ubuntu-18.04

steps:
- checkout: self
submodules: true

# The script simplifies installation of different versions of CUDA.
- bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0"
displayName: Install CUDA

# CMake 3.5.1 is the minimum version supported
- bash: |
wget -nv https://cmake.org/files/v3.5/cmake-3.5.1-Linux-x86_64.tar.gz
tar zxf cmake-3.5.1-Linux-x86_64.tar.gz
./cmake-3.5.1-Linux-x86_64/bin/cmake --version
displayName: Download CMake
# GCC 5 is the minimum version supported
- bash: |
/usr/bin/gcc-7 --version
mkdir -p build
cd build
CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \
../cmake-3.5.1-Linux-x86_64/bin/cmake .. \
-DCOMPILE_CPU=on \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0
displayName: Configure CMake
- bash: make -j3
displayName: Compile
workingDirectory: build

- bash: |
./marian --version
./marian-decoder --version
./marian-scorer --version
displayName: Print versions
workingDirectory: build
######################################################################
- job: BuildMacOS
condition: eq(${{ parameters.runBuilds }}, true)
2 changes: 1 addition & 1 deletion examples
11 changes: 9 additions & 2 deletions scripts/ci/install_cuda_ubuntu.sh
@@ -60,6 +60,13 @@ CUDA_PACKAGES_IN=(

CUDA_PACKAGES=""
for package in "${CUDA_PACKAGES_IN[@]}"; do
# @todo This is not perfect. Should probably provide a separate list for diff versions
# cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y
if [[ "${package}" == "nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then
package="compiler"
elif [[ "${package}" == "compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then
package="nvcc"
fi
# Build the full package name and append to the string.
CUDA_PACKAGES+=" cuda-${package}-${CUDA_MAJOR}-${CUDA_MINOR}"
done
@@ -72,8 +79,8 @@ echo "CUDA_PACKAGES ${CUDA_PACKAGES}"

PIN_FILENAME="cuda-ubuntu${UBUNTU_VERSION}.pin"
PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/${PIN_FILENAME}"
APT_KEY_URL="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/7fa2af80.pub"
REPO_URL="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/"
APT_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/7fa2af80.pub"
REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/"

echo "PIN_FILENAME ${PIN_FILENAME}"
echo "PIN_URL ${PIN_URL}"
2 changes: 1 addition & 1 deletion src/3rd_party/half_float/umHalf.inl
@@ -344,7 +344,7 @@ inline HalfFloat operator+ (HalfFloat one, HalfFloat two)

// compute the difference between the two exponents. shifts with negative
// numbers are undefined, thus we need two code paths
register int expDiff = one.IEEE.Exp - two.IEEE.Exp;
/*register*/ int expDiff = one.IEEE.Exp - two.IEEE.Exp;

if (0 == expDiff)
{
11 changes: 9 additions & 2 deletions src/command/marian_conv.cpp
@@ -86,11 +86,17 @@ int main(int argc, char** argv) {
graph->setDevice(CPU0);
graph->load(modelFrom);

std::vector<lsh::ParamConvInfo> toBeLSHed;
if(addLsh) {
// Add dummy parameters for the LSH before the model actually gets initialized.
// This creates the parameters with useless values in the tensors, but it gives us the memory we need.
toBeLSHed = {
{lshOutputWeights, "lsh_output_codes", "lsh_output_rotation", lshNBits}
};

graph->setReloaded(false);
lsh::addDummyParameters(graph, /*weights=*/lshOutputWeights, /*nBits=*/lshNBits);
for(auto p : toBeLSHed)
lsh::addDummyParameters(graph, /*paramInfo=*/p);
graph->setReloaded(true);
}

@@ -99,7 +105,8 @@
if(addLsh) {
// After initialization, hijack the parameters for the LSH and force-overwrite them with correct values.
// Once this is done we can just pack and save as normal.
lsh::overwriteDummyParameters(graph, /*weights=*/lshOutputWeights);
for(auto p : toBeLSHed)
lsh::overwriteDummyParameters(graph, /*paramInfo=*/p);
}

// added a flag indicating whether the weights need to be packed or not
2 changes: 1 addition & 1 deletion src/common/config_parser.cpp
@@ -510,7 +510,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"none");
cli.add<std::string>("--guided-alignment-cost",
"Cost type for guided alignment: ce (cross-entropy), mse (mean square error), mult (multiplication)",
"mse");
"ce");
cli.add<double>("--guided-alignment-weight",
"Weight for guided alignment cost",
0.1);
39 changes: 35 additions & 4 deletions src/data/alignment.cpp
@@ -2,6 +2,8 @@
#include "common/utils.h"

#include <algorithm>
#include <cmath>
#include <set>

namespace marian {
namespace data {
@@ -10,10 +12,11 @@ WordAlignment::WordAlignment() {}

WordAlignment::WordAlignment(const std::vector<Point>& align) : data_(align) {}

WordAlignment::WordAlignment(const std::string& line) {
WordAlignment::WordAlignment(const std::string& line, size_t srcEosPos, size_t tgtEosPos) {
std::vector<std::string> atok = utils::splitAny(line, " -");
for(size_t i = 0; i < atok.size(); i += 2)
data_.emplace_back(Point{ (size_t)std::stoi(atok[i]), (size_t)std::stoi(atok[i + 1]), 1.f });
data_.push_back(Point{ (size_t)std::stoi(atok[i]), (size_t)std::stoi(atok[i + 1]), 1.f });
data_.push_back(Point{ srcEosPos, tgtEosPos, 1.f }); // add alignment point for both EOS symbols
}

void WordAlignment::sort() {
@@ -22,6 +25,35 @@ void WordAlignment::sort() {
});
}

void WordAlignment::normalize(bool reverse/*=false*/) {
std::vector<size_t> counts;
counts.reserve(data_.size());

// reverse==false : normalize target word prob by number of source words
// reverse==true : normalize source word prob by number of target words
auto srcOrTgt = [](const Point& p, bool reverse) {
return reverse ? p.srcPos : p.tgtPos;
};

for(const auto& a : data_) {
size_t pos = srcOrTgt(a, reverse);
if(counts.size() <= pos)
counts.resize(pos + 1, 0);
counts[pos]++;
}

// a.prob at this point is either 1 or normalized to a different value,
// but we just set it to 1 / count, so multiple calls result in re-normalization
// regardless of forward or reverse direction. We also set the remaining values to 1.
for(auto& a : data_) {
size_t pos = srcOrTgt(a, reverse);
if(counts[pos] > 1)
a.prob = 1.f / counts[pos];
else
a.prob = 1.f;
}
}

std::string WordAlignment::toString() const {
std::stringstream str;
for(auto p = begin(); p != end(); ++p) {
@@ -32,7 +64,7 @@ std::string WordAlignment::toString() const {
return str.str();
}

WordAlignment ConvertSoftAlignToHardAlign(SoftAlignment alignSoft,
WordAlignment ConvertSoftAlignToHardAlign(const SoftAlignment& alignSoft,
float threshold /*= 1.f*/) {
WordAlignment align;
// Alignments by maximum value
@@ -58,7 +90,6 @@ WordAlignment ConvertSoftAlignToHardAlign(SoftAlignment alignSoft,
}
}
}

// Sort alignment pairs in ascending order
align.sort();

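
A standalone sketch of the normalization scheme implemented above, using a local Point type instead of Marian's headers: with reverse==false, alignment points are counted per target position, so when several source words align to the same target word each point receives 1/count and the probabilities sum to 1 per target word.

#include <cstddef>
#include <iostream>
#include <vector>

struct Point { std::size_t srcPos, tgtPos; float prob; };  // mirrors WordAlignment::Point

// Same scheme as WordAlignment::normalize(false): count alignment points per target
// position and assign each point 1/count.
void normalizeByTarget(std::vector<Point>& data) {
  std::vector<std::size_t> counts;
  for(const auto& p : data) {
    if(counts.size() <= p.tgtPos)
      counts.resize(p.tgtPos + 1, 0);
    counts[p.tgtPos]++;
  }
  for(auto& p : data)
    p.prob = 1.f / counts[p.tgtPos];
}

int main() {
  // "srcPos-tgtPos" pairs "0-0 1-0 2-1" plus an EOS-EOS point, as the new constructor
  // would append for srcEosPos=3, tgtEosPos=2
  std::vector<Point> align = {{0, 0, 1.f}, {1, 0, 1.f}, {2, 1, 1.f}, {3, 2, 1.f}};
  normalizeByTarget(align);
  for(const auto& p : align)  // prints 0-0/0.5 1-0/0.5 2-1/1 3-2/1
    std::cout << p.srcPos << "-" << p.tgtPos << "/" << p.prob << " ";
  std::cout << "\n";
}
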
21 changes: 16 additions & 5 deletions src/data/alignment.h
@@ -1,20 +1,22 @@
#pragma once

#include <sstream>
#include <tuple>
#include <vector>

namespace marian {
namespace data {

class WordAlignment {
struct Point
{
public:
struct Point {
size_t srcPos;
size_t tgtPos;
float prob;
};
private:
std::vector<Point> data_;

public:
WordAlignment();

@@ -28,11 +30,14 @@ class WordAlignment {
public:

/**
* @brief Constructs word alignments from textual representation.
* @brief Constructs word alignments from textual representation. Adds alignment point for externally
* supplied EOS positions in source and target string.
*
* @param line String in the form of "0-0 1-1 1-2", etc.
*/
WordAlignment(const std::string& line);
WordAlignment(const std::string& line, size_t srcEosPos, size_t tgtEosPos);

Point& operator[](size_t i) { return data_[i]; }

auto begin() const -> decltype(data_.begin()) { return data_.begin(); }
auto end() const -> decltype(data_.end()) { return data_.end(); }
@@ -46,6 +51,12 @@ class WordAlignment {
*/
void sort();

/**
* @brief Normalizes alignment probabilities of target words to sum to 1 over aligned source words.
* This is needed for correct cost computation for guided alignment training with the CE cost criterion.
*/
void normalize(bool reverse=false);

/**
* @brief Returns textual representation.
*/
@@ -56,7 +67,7 @@
// Also used on QuickSAND boundary where beam and batch size is 1. Then it is simply [t][s] -> P(s|t)
typedef std::vector<std::vector<float>> SoftAlignment; // [trg pos][beam depth * max src length * batch size]

WordAlignment ConvertSoftAlignToHardAlign(SoftAlignment alignSoft,
WordAlignment ConvertSoftAlignToHardAlign(const SoftAlignment& alignSoft,
float threshold = 1.f);

std::string SoftAlignToString(SoftAlignment align);
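
The normalize() documentation above ties into the switch of the default --guided-alignment-cost to ce elsewhere in this commit. As a rough sketch, one common formulation of the guided-alignment cross-entropy cost (not copied from Marian's loss code): with the normalized reference alignment a(s|t) summing to 1 over source positions s for each target position t, and the model's attention alpha(s|t),

L_{ga} = -\frac{1}{T} \sum_{t=1}^{T} \sum_{s=1}^{S} a(s \mid t)\, \log \alpha(s \mid t)

With hard alignments normalized as above, each target word contributes the average negative log-attention of the source words it is aligned to.
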
2 changes: 1 addition & 1 deletion src/data/batch.h
@@ -24,7 +24,7 @@ class Batch {
const std::vector<size_t>& getSentenceIds() const { return sentenceIds_; }
void setSentenceIds(const std::vector<size_t>& ids) { sentenceIds_ = ids; }

virtual void setGuidedAlignment(std::vector<float>&&) = 0;
virtual void setGuidedAlignment(std::vector<WordAlignment>&&) = 0;
virtual void setDataWeights(const std::vector<float>&) = 0;
virtual ~Batch() {};
protected:
13 changes: 6 additions & 7 deletions src/data/corpus.cpp
@@ -132,14 +132,13 @@ SentenceTuple Corpus::next() {
tup.markAltered();
addWordsToSentenceTuple(fields[i], vocabId, tup);
}

// weights are added last to the sentence tuple, because this runs a validation that needs
// length of the target sequence
if(alignFileIdx_ > -1)
addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
if(weightFileIdx_ > -1)
addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
}
// weights are added last to the sentence tuple, because this runs a validation that needs
// length of the target sequence
if(alignFileIdx_ > -1)
addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
if(weightFileIdx_ > -1)
addWeightsToSentenceTuple(fields[weightFileIdx_], tup);

// check if all streams are valid, that is, non-empty and no longer than maximum allowed length
if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
25 changes: 11 additions & 14 deletions src/data/corpus_base.cpp
@@ -429,11 +429,13 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line,

void CorpusBase::addAlignmentToSentenceTuple(const std::string& line,
SentenceTupleImpl& tup) const {
ABORT_IF(rightLeft_,
"Guided alignment and right-left model cannot be used "
"together at the moment");
ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used together at the moment");
ABORT_IF(tup.size() != 2, "Using alignment between source and target, but sentence tuple has {} elements??", tup.size());

auto align = WordAlignment(line);
size_t srcEosPos = tup[0].size() - 1;
size_t tgtEosPos = tup[1].size() - 1;

auto align = WordAlignment(line, srcEosPos, tgtEosPos);
tup.setAlignment(align);
}

@@ -457,22 +459,17 @@ void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupl

void CorpusBase::addAlignmentsToBatch(Ptr<CorpusBatch> batch,
const std::vector<Sample>& batchVector) {
int srcWords = (int)batch->front()->batchWidth();
int trgWords = (int)batch->back()->batchWidth();
std::vector<WordAlignment> aligns;

int dimBatch = (int)batch->getSentenceIds().size();

std::vector<float> aligns(srcWords * dimBatch * trgWords, 0.f);

aligns.reserve(dimBatch);

for(int b = 0; b < dimBatch; ++b) {

// If the batch vector is altered within marian by, for example, case augmentation,
// the guided alignments we received for this tuple cease to be valid.
// Hence skip setting alignments for that sentence tuple.
if (!batchVector[b].isAltered()) {
for(auto p : batchVector[b].getAlignment()) {
size_t idx = p.srcPos * dimBatch * trgWords + b * trgWords + p.tgtPos;
aligns[idx] = 1.f;
}
aligns.push_back(std::move(batchVector[b].getAlignment()));
}
}
batch->setGuidedAlignment(std::move(aligns));
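
For contrast with the dense representation removed above, a standalone sketch (illustrative only, not Marian code) of the old layout: one float per (source position, sentence, target position) triple, indexed as srcPos * dimBatch * trgWords + b * trgWords + tgtPos. The batch now stores one WordAlignment per sentence instead; the helper below reconstructs the dense buffer from that sparse form, here filling in the stored probabilities rather than hard 1s.

#include <cstddef>
#include <vector>

struct Point { std::size_t srcPos, tgtPos; float prob; };
using SentenceAlignment = std::vector<Point>;  // stands in for marian::data::WordAlignment

// Expand per-sentence alignment points into the dense [srcWords * dimBatch * trgWords]
// buffer that addAlignmentsToBatch used to build.
std::vector<float> toDense(const std::vector<SentenceAlignment>& aligns,
                           std::size_t srcWords, std::size_t trgWords) {
  std::size_t dimBatch = aligns.size();
  std::vector<float> dense(srcWords * dimBatch * trgWords, 0.f);
  for(std::size_t b = 0; b < dimBatch; ++b)
    for(const auto& p : aligns[b])
      dense[p.srcPos * dimBatch * trgWords + b * trgWords + p.tgtPos] = p.prob;
  return dense;
}

int main() {
  // two sentences, source width 3, target width 2
  std::vector<SentenceAlignment> aligns = {
    {{0, 0, 1.f}, {2, 1, 1.f}},
    {{1, 0, 1.f}},
  };
  auto dense = toDense(aligns, /*srcWords=*/3, /*trgWords=*/2);  // 12 floats, only 3 non-zero
  return dense.size() == 12 ? 0 : 1;
}
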
