From c29cc83dc4d234f0e3a00a46a729053132b408b8 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 30 Mar 2021 08:58:11 +0000 Subject: [PATCH 001/254] Update submodule examples --- examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples b/examples index c19b7814d..6d5921cc7 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit c19b7814d71febf1053bd93af6ac314b46204092 +Subproject commit 6d5921cc7de91f4e915b59e9c52c9a76c4e99b00 From 4408e88a941b60e23ddb12e3afcf599ec0f42e9b Mon Sep 17 00:00:00 2001 From: Rohit Jain Date: Tue, 30 Mar 2021 21:43:06 +0000 Subject: [PATCH 002/254] Merged PR 18366: Fix generation of special control characters for default vocabulary This PR extends the --allow-special feature to default vocabulary items as well. If the default vocabulary is provided with symbols ostensibly generated from the SentencePiece Byte Fallback mechanism, we suppress the control characters from that list. --- CHANGELOG.md | 1 + src/data/default_vocab.cpp | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56ede4e55..cc9f179cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Extend suppression of unwanted output symbols, specifically "\n" from default vocabulary if generated by SentencePiece with byte-fallback. Deactivates with --allow-special - Better suppression of unwanted output symbols, specifically "\n" from SentencePiece with byte-fallback. Can be deactivated with --allow-special - Display decoder time statistics with marian-decoder --stat-freq 10 ... - Support for MS-internal binary shortlist diff --git a/src/data/default_vocab.cpp b/src/data/default_vocab.cpp index 7706a1c11..2d92f4f64 100644 --- a/src/data/default_vocab.cpp +++ b/src/data/default_vocab.cpp @@ -28,6 +28,9 @@ class DefaultVocab : public IVocab { std::vector suffixes_ = { ".yml", ".yaml", ".json" }; + // Contains control characters added to vocab, possibly due to byte-fallback + std::vector controlChars_; + class VocabFreqOrderer { private: const std::unordered_map& counter_; @@ -71,6 +74,16 @@ class DefaultVocab : public IVocab { return decode(sentence, /*ignoreEOS=*/true); } + // SentencePiece with byte-fallback may generate control symbols with output sampling. + // Let's mark them as special and suppress them later on output. This is generally safe + // for UTF-8 since control chars are not used as partial bytes in multi-byte sequences. + // They only appear in single-byte chars as themselves and this is what we suppress. + void addSpecialWords(std::vector& special) const override { + special.reserve(special.size() + controlChars_.size()); + for(auto c : controlChars_) + special.push_back(c); + } + virtual std::string type() const override { return "DefaultVocab"; } virtual Word getEosId() const override { return eosId_; } @@ -130,6 +143,8 @@ class DefaultVocab : public IVocab { } ABORT_IF(id2str_.empty(), "Empty vocabulary: ", vocabPath); + populateControlChars(); + addRequiredVocabulary(vocabPath, isJson); return std::max(id2str_.size(), maxSize); @@ -172,6 +187,17 @@ class DefaultVocab : public IVocab { private: + // Creates the first 32 control characters as done in byte-fallback and checks if they exist in the vocab. + // This makes sure that we do not waste computational effort on suppression if they don't actually appear. 
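// (Worked example, not taken from the patch: with byte fallback, "\n" is the single byte 0x0A and
//  SentencePiece emits it as the piece "<0x0A>"; the fmt::format("<0x{:02X}>", i) call below produces
//  exactly those strings for i = 0..31, so the vocabulary id of "<0x0A>", if present, ends up in
//  controlChars_ and is later reported through addSpecialWords() for suppression on output.)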
+ void populateControlChars() { + for(int i = 0; i < 32; ++i) { + std::string bytePiece = fmt::format("<0x{:02X}>", i); // 0 becomes <0x00>, 10 becomes <0x0A>, note uppercase A and lowercase x + auto id = (*this)[bytePiece]; + if(id != unkId_) + controlChars_.push_back(id); + } + } + virtual void addRequiredVocabulary(const std::string& vocabPath, bool isJson) { // look up ids for and , which are required // The name backCompatStr is alternatively accepted for Yaml vocabs if id From bfa6180033307f579f91d4220971474d0c3de86d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 8 Apr 2021 07:30:38 +0000 Subject: [PATCH 003/254] Revert "remove TC_MALLOC from optional dependencies (#840)" This reverts commit 096c48e51cd2e61bb275345d7cca99cbfd6bc5c7. --- CHANGELOG.md | 1 - CMakeLists.txt | 12 ++++++++++++ VERSION | 1 - src/3rd_party/CMakeLists.txt | 8 ++++++-- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56ede4e55..363244531 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Changed - Moved FBGEMM pointer to commit c258054 for gcc 9.3+ fix -- Remove TC_MALLOC as an optional build depdendency. Doesn't seem to actually do anything anymore. - Change compile options a la -DCOMPILE_CUDA_SM35 to -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL, -DCOMPILE_PASCAL, -DCOMPILE_VOLTA, -DCOMPILE_TURING and -DCOMPILE_AMPERE - Disable -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL by default. diff --git a/CMakeLists.txt b/CMakeLists.txt index dffbd1ff2..4ee339781 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -420,6 +420,18 @@ if(USE_STATIC_LIBS) endif() endif() +############################################################################### +# Find Tcmalloc +if(NOT WIN32) + find_package(Tcmalloc) + if(Tcmalloc_FOUND) + include_directories(${Tcmalloc_INCLUDE_DIR}) + set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES}) + else(Tcmalloc_FOUND) + message(WARNING "Cannot find TCMalloc library. Continuing.") + endif(Tcmalloc_FOUND) +endif() + ############################################################################### # Find BLAS library if(COMPILE_CPU) diff --git a/VERSION b/VERSION index ba9cd1cd2..e219c7fab 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1 @@ v1.10.14 - diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 9a809d5ec..2bef31296 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -74,9 +74,13 @@ if(USE_SENTENCEPIECE) if(NOT GENERATE_MARIAN_INSTALL_TARGETS) set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE) endif() + set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.") - # disable TCMALLOC for SentencePiece - set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "Enable TCMalloc if available." FORCE) + if(USE_STATIC_LIBS) + set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." 
FORCE) + else(USE_STATIC_LIBS) + set(SPM_TCMALLOC_STATIC OFF CACHE BOOL "Link static library of TCMALLOC.") + endif(USE_STATIC_LIBS) add_subdirectory(./sentencepiece) include_directories(./sentencepiece) From 0223ce90b1d6afaa047f2baeb4d0689f87d7ae81 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 8 Apr 2021 18:41:15 +0100 Subject: [PATCH 004/254] Fix Ubuntu GitHub checks (#848) * Change ubuntu-latest to ubuntu-18.04 * Install gcc/g++ --- .github/workflows/ubuntu.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 6bdff5534..5353c2144 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -13,7 +13,7 @@ jobs: include: # Ubuntu CPU-only build - name: "Ubuntu CPU-only" - os: ubuntu-latest + os: ubuntu-18.04 cuda: "" gcc: 7 cpu: true @@ -72,7 +72,9 @@ jobs: # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev # Boost is no longer pre-installed on GitHub-hosted runners - name: Install dependencies - run: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev + run: | + sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev \ + gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL From fddd0e0661eb13b2a132e401e268315a35e468f7 Mon Sep 17 00:00:00 2001 From: rhenry-nv <72179960+rhenry-nv@users.noreply.github.com> Date: Thu, 8 Apr 2021 21:46:27 -0700 Subject: [PATCH 005/254] Adds better Affine support for GPUs when using CUDA 11. Introduces a new bias addition kernel for CUDA < 11 (#778) Co-authored-by: Marcin Junczys-Dowmunt --- CHANGELOG.md | 2 + CMakeLists.txt | 14 +- src/CMakeLists.txt | 1 + src/graph/expression_operators.cpp | 12 +- src/graph/expression_operators.h | 12 +- src/graph/node_operators_binary.h | 124 ++++++++++++++--- src/layers/generic.h | 25 +++- src/layers/output.cpp | 2 +- src/models/transformer.h | 27 +--- src/tensors/cpu/prod.cpp | 17 +++ src/tensors/dispatch.h | 42 ++++++ src/tensors/gpu/prod.cpp | 217 +++++++++++++++++++++++++++++ src/tensors/gpu/prod.cu | 69 +++++++++ src/tensors/gpu/prod.h | 15 ++ src/tensors/tensor_operators.h | 2 + src/tests/units/operator_tests.cpp | 21 ++- 16 files changed, 551 insertions(+), 51 deletions(-) create mode 100644 src/tensors/gpu/prod.cu diff --git a/CHANGELOG.md b/CHANGELOG.md index 363244531..bfdfe5107 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Adds custom bias epilogue kernel. +- Adds support for fusing relu and bias addition into gemms when using cuda 11. - Better suppression of unwanted output symbols, specifically "\n" from SentencePiece with byte-fallback. Can be deactivated with --allow-special - Display decoder time statistics with marian-decoder --stat-freq 10 ... - Support for MS-internal binary shortlist diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ee339781..7c50681f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -347,8 +347,20 @@ if(CUDA_FOUND) endif() message(STATUS "Found CUDA libraries: ${CUDA_LIBS}") else(USE_STATIC_LIBS) + set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) + # We actually only need cublasLt here after cuda 11. 
Marian will work fine without it pre cuda 11. We want to force CMake to use the cublas + # version that ships with CUDA 11 so we force the search to occur inside of the cuda toolkit directory. + set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) + if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")) + find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH) + if(NOT CUDA_cublasLt_LIBRARY) + message(FATAL_ERROR "cuBLASLt library not found") + endif() + set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY}) + set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY}) + endif() set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) - message(STATUS "Found CUDA libraries: ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}") + message(STATUS "Found CUDA libraries: ${CUDA_LIBS}") endif(USE_STATIC_LIBS) if(USE_CUDNN) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 64b86a695..cf276137d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -175,6 +175,7 @@ if(CUDA_FOUND) tensors/gpu/device.cu tensors/gpu/algorithm.cu tensors/gpu/prod.cpp + tensors/gpu/prod.cu tensors/gpu/prod_sparse.cpp tensors/gpu/topk.cu tensors/gpu/element.cu diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index f354caabc..048c74789 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -1,4 +1,5 @@ #include "graph/expression_operators.h" +#include "common/definitions.h" #include "layers/constructors.h" #include "graph/node_operators.h" @@ -518,7 +519,7 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) { return Expression(a, b, transA, transB, scale); } -static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { +Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // general version, MKL, CBlas or CUDA int rows = a->shape().elements() / a->shape()[-1]; @@ -577,6 +578,15 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { } } +Expr affineWithRelu(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { + auto graph = a->graph(); + + if(graph->isInference() && graph->getDeviceId().type == DeviceType::gpu) + return Expression(a, b, bias, transA, transB, scale); + else + return relu(affine(a, b, bias, transA, transB, scale)); +} + // @TODO: Not a great place to check this #if CUDA_VERSION < 11000 // multiply a CSR matrix A with a matrix B diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index ca0739e44..81b0f5ea2 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -488,11 +488,21 @@ Expr bdot(Expr a, */ Expr affine(Expr a, Expr b, - Expr c, + Expr bias, bool transA = false, bool transB = false, float scalar = 1.f); +/** + * As above, but efficiently applies relu transformation to output. For inference only. + */ +Expr affineWithRelu(Expr a, + Expr b, + Expr bias, + bool transA = false, + bool transB = false, + float scalar = 1.f); + /** * Computes the dot product of CSR-tensor @p A with @p B. 
*/ diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 261885ec4..55f105a96 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -266,17 +266,18 @@ class AffineNodeOp : public NaryNodeOp { NodeOps forwardOps() override { using namespace functional; - + return { - NodeOp( - Prod(val_, - child(0)->val(), - child(1)->val(), - transA_, - transB_, - 0.f, - scalar_); - Prod(val_, child(3)->val(), child(2)->val(), false, false, 1.f, 1.f)) + NodeOp(Affine(val_, + graph()->allocator(), + child(0)->val(), + child(1)->val(), + child(2)->val(), + transA_, + transB_, + 0.f, + scalar_, + /*doRelu=*/false)) }; } @@ -323,8 +324,7 @@ class AffineNodeOp : public NaryNodeOp { false, 1.0, scalar_, computeTypeB)), - NodeOp(Prod( - child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) }; if(transA_ && !transB_) @@ -343,8 +343,7 @@ class AffineNodeOp : public NaryNodeOp { false, 1.0, scalar_, computeTypeB)), - NodeOp(Prod( - child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) }; if(transA_ && transB_) @@ -363,8 +362,7 @@ class AffineNodeOp : public NaryNodeOp { true, 1.0, scalar_, computeTypeB)), - NodeOp(Prod( - child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) }; return { @@ -382,8 +380,7 @@ class AffineNodeOp : public NaryNodeOp { false, 1.0, scalar_, computeTypeB)), - NodeOp(Prod( - child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) }; } @@ -414,6 +411,97 @@ class AffineNodeOp : public NaryNodeOp { }; +class AffineWithReluNodeOp : public NaryNodeOp { +private: + friend class SerializationHelpers; + bool transA_; + bool transB_; + float scalar_; + +public: + AffineWithReluNodeOp(Expr a, + Expr b, + Expr bias, + bool transA, + bool transB, + float scalar) + : NaryNodeOp({a, b, bias}, newShape(a, b, transA, transB)), + transA_(transA), + transB_(transB), + scalar_(scalar) { + ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu, + "AffineWithReluNodeOp currently only supported for inference on GPU"); + } + + Shape newShape(Expr a, Expr b, bool transA, bool transB) { + auto shapeA = a->shape(); + if(transA) { + shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]); + shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]); + } + + auto shapeB = b->shape(); + if(transB) { + shapeB.set(shapeB.size() - 2, b->shape()[shapeB.size() - 1]); + shapeB.set(shapeB.size() - 1, b->shape()[shapeB.size() - 2]); + } + + Shape outShape = shapeA; + outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]); + ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2], + "Matrix product requires inner dimensions to match in {}{} * {}{}", std::string(shapeA), transA, std::string(shapeB), transB); + return outShape; + } + + NodeOps forwardOps() override { + ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu, + "AffineWithReluNodeOp currently only supported for inference on GPU"); + + return { + NodeOp(Affine(val_, + graph()->allocator(), + child(0)->val(), + child(1)->val(), + child(2)->val(), + transA_, + 
transB_, + 0.f, + scalar_, + /*doRelu=*/true)) + }; + } + + NodeOps backwardOps() override { + ABORT("AffineWithReluNodeOp cannot be used for training??"); + return {}; + } + + const std::string type() override { return "affineWithRelu"; } + + virtual size_t hash() override { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, transA_); + util::hash_combine(seed, transB_); + util::hash_combine(seed, scalar_); + return seed; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(transA_ != cnode->transA_) + return false; + if(transB_ != cnode->transB_) + return false; + if(scalar_ != cnode->scalar_) + return false; + return true; + } +}; + class DotBatchedNodeOp : public NaryNodeOp { private: friend class SerializationHelpers; diff --git a/src/layers/generic.h b/src/layers/generic.h index 89f5c1e9d..5eb936151 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -1,5 +1,7 @@ #pragma once +#include "common/definitions.h" +#include "graph/expression_operators.h" #include "marian.h" #include "data/shortlist.h" @@ -168,22 +170,37 @@ class Dense : public LayerBase, public IUnaryLayer { // --- a few layers with built-in parameters created on the fly, without proper object // @TODO: change to a proper layer object +static inline std::function activationByName(const std::string& actName) { + if (actName == "relu") + return (ActivationFunction*)relu; + else if (actName == "swish") + return (ActivationFunction*)swish; + else if (actName == "gelu") + return (ActivationFunction*)gelu; + else if (actName == "") // return identity function if activation name is empty + return [](Expr x) { return x; }; + ABORT("Invalid activation name '{}'", actName); +} + // like affine() but with built-in parameters, activation, and dropout static inline Expr denseInline(Expr x, std::string prefix, std::string suffix, int outDim, Ptr initFn = inits::glorotUniform(), - const std::function& actFn = nullptr, + std::string actName = "", float dropProb = 0.0f) { auto graph = x->graph(); auto W = graph->param(prefix + "_W" + suffix, {x->shape()[-1], outDim}, inits::glorotUniform()); auto b = graph->param(prefix + "_b" + suffix, {1, outDim}, inits::zeros()); - x = affine(x, W, b); - if(actFn) - x = actFn(x); + if(actName == "relu") { + x = affineWithRelu(x, W, b); // speed optimization for inference, @TODO: handle better in future layer framework + } else { + x = affine(x, W, b); + x = activationByName(actName)(x); + } x = dropout(x, dropProb); // @TODO: check for infernce? 
return x; } diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 1d9c7b4b0..4c34bdcea 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -170,7 +170,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { /*suffix=*/"1", ffnDim, inits::glorotUniform(), - (ActivationFunction*)relu, + "relu", ffnDropProb); f = denseInline(f, name + "_ffn", /*suffix=*/"2", inputDim); // add & norm diff --git a/src/models/transformer.h b/src/models/transformer.h index 6368cc6a1..79b59000a 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -396,18 +396,6 @@ class Transformer : public EncoderOrDecoderBase { opt("transformer-heads"), /*cache=*/false); } - static inline - std::function activationByName(const std::string& actName) - { - if (actName == "relu") - return (ActivationFunction*)relu; - else if (actName == "swish") - return (ActivationFunction*)swish; - else if (actName == "gelu") - return (ActivationFunction*)gelu; - ABORT("Invalid activation name '{}'", actName); - } - Expr LayerFFN(std::string prefix, Expr input) const { int dimModel = input->shape()[-1]; @@ -415,9 +403,9 @@ class Transformer : public EncoderOrDecoderBase { auto opsPre = opt("transformer-preprocess"); auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb); + auto actName = opt("transformer-ffn-activation"); int dimFfn = opt("transformer-dim-ffn"); int depthFfn = opt("transformer-ffn-depth"); - auto actFn = activationByName(opt("transformer-ffn-activation")); float ffnDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); @@ -427,12 +415,11 @@ class Transformer : public EncoderOrDecoderBase { // the stack of FF layers for(int i = 1; i < depthFfn; ++i) - output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actFn, ffnDropProb); + output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actName, ffnDropProb); output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel, initFn); auto opsPost = opt("transformer-postprocess"); - output - = postProcess(prefix + "_ffn", opsPost, output, input, dropProb); + output = postProcess(prefix + "_ffn", opsPost, output, input, dropProb); return output; } @@ -450,21 +437,21 @@ class Transformer : public EncoderOrDecoderBase { // FFN int dimAan = opt("transformer-dim-aan"); int depthAan = opt("transformer-aan-depth"); - auto actFn = activationByName(opt("transformer-aan-activation")); + auto actName = opt("transformer-aan-activation"); float aanDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 
1.f / sqrtf((float)depth_) : 1.f); // the stack of AAN layers for(int i = 1; i < depthAan; ++i) - y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, initFn, actFn, aanDropProb); + y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, initFn, actName, aanDropProb); if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed y = denseInline(y, prefix, std::to_string(depthAan), dimModel, initFn); bool noGate = opt("transformer-aan-nogate"); if(!noGate) { - auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, initFn, (ActivationFunction*)sigmoid); - auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, initFn, (ActivationFunction*)sigmoid); + auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, initFn, "sigmoid"); + auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, initFn, "sigmoid"); y = gi * x + gf * y; } diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index f77337d65..6e28bdd23 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -212,6 +212,23 @@ void ProdWithBias(marian::Tensor C, cpu::integer::AddBias(C, bias); } +void Affine(marian::Tensor C, + Ptr /*allocator*/, + const marian::Tensor& A, + const marian::Tensor& B, + const marian::Tensor& bias, + bool transA, + bool transB, + float beta, + float scalar, + bool reluPostprocess) { + using namespace functional; + ProdWithBias(C, A, B, bias, transA, transB, beta, scalar); + if(reluPostprocess) + cpu::Element(_1 = ReLU(_1), C); // @TODO: also fuse with AddBias +} + + void CSRProd(marian::Tensor C, Ptr /*allocator*/, const marian::Tensor& S_values, diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h index 094f156cb..f71543511 100644 --- a/src/tensors/dispatch.h +++ b/src/tensors/dispatch.h @@ -152,6 +152,30 @@ cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ } +#define DISPATCH10( \ + Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9, Arg10) \ +namespace gpu { \ +void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9, Arg10); \ +} \ +namespace cpu { \ +void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9, Arg10); \ +} \ +static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7, \ + Arg8 arg8, \ + Arg9 arg9, \ + Arg10 arg10) { \ + if(arg1->getBackend()->getDeviceId().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); \ +} + #else #define DISPATCH1(Function, Arg1) \ @@ -248,4 +272,22 @@ cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ } +#define DISPATCH10( \ + Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9, Arg10) \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9, Arg10); \ + } \ + static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7, \ + Arg8 arg8, \ + Arg9 arg9, \ + Arg10 arg10) { \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10); \ + } + #endif diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index bf7e5512c..8cfa78cab 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -11,10 +11,34 @@ #include "tensors/gpu/cuda_helpers.h" // clang-format on +#if CUDA_VERSION >= 11000 +#include +#endif + namespace marian { namespace gpu { +// It seems 
that the bias must be 8 byte aligned for the cublasLt epilogue to work. Therefore, +// if the bias pointer is not 8 byte aligned, we do a normal matmul in cublasLt and invoke a +// custom epilogue kernel. +static constexpr int REQUIRED_BIAS_ALIGNMENT = 8; + +// Used to set preferences for cublasLt to filter out algos if matrices to not meet default 256 byte alignment +int getAlignmentUpTo256(const void *ptr) { + uintptr_t addr = (uintptr_t)ptr; + int trailingZeros = 0; + + for(int shiftAmt = 8, mask = 0xFF; shiftAmt > 0; shiftAmt /= 2, mask >>=shiftAmt) { + if ((addr & mask) == 0) { + trailingZeros += shiftAmt; + addr >>= shiftAmt; + } + } + + return std::min(256, 1 << trailingZeros); +} + // The explicit version of matmult like cublasGemmEx choose their math mode based on the algorithm that // has been passed into the function call and seem to ignore setMathMode. Here we query the used math mode // to choose the algorithm. @@ -412,5 +436,198 @@ void ProdBatched(marian::Tensor C, } } +#if CUDA_VERSION >= 11000 // Earlier versions of cublasLT do not support bias addition for fp32 and fp16. + +static cublasStatus_t cublasLtAffineHelper(cublasLtHandle_t ltHandle, cublasOperation_t transA, cublasOperation_t transB, + cudaDataType matrixType, + int m, int n, int k, const void *alpha, const void *A, int lda, const void *B, + int ldb, const void *beta, void *C, int ldc, const void* bias, + void* workspace, size_t workspaceSize, bool do_relu, cudaStream_t stream) { + + cublasLtMatmulDesc_t operationDesc = NULL; + cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; + cublasLtMatmulPreference_t preference = NULL; + + int returnedResults = 0; + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + + cublasLtEpilogue_t epilogue = do_relu? CUBLASLT_EPILOGUE_RELU_BIAS: CUBLASLT_EPILOGUE_BIAS; + cublasComputeType_t computeType = matrixType == CUDA_R_32F? CUBLAS_COMPUTE_32F_FAST_16F: CUBLAS_COMPUTE_16F; + + // If the bias is not aligned, just matmul and invoke custom epilogue later. + // cublas fails with a misalignment error if this condition is not true. + if((uintptr_t)bias % REQUIRED_BIAS_ALIGNMENT != 0) { + epilogue = CUBLASLT_EPILOGUE_DEFAULT; + } + + CUBLAS_CHECK(cublasLtMatmulDescCreate(&operationDesc, computeType, matrixType)); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias))); + + CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&Adesc, matrixType, transA == CUBLAS_OP_N ? m : k, transA == CUBLAS_OP_N ? k : m, lda)); + CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&Bdesc, matrixType, transB == CUBLAS_OP_N ? k : n, transB == CUBLAS_OP_N ? n : k, ldb)); + CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&Cdesc, matrixType, m, n, ldc)); + + // I think we need to do this since we can slice matrices... + // The allocator always allocates on 256 byte boundaries but we have no guarantees about the alignment of a matrix slice so we filter out + // algorithms that would not work with matrices not aligned to 256 bytes. 
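// (Illustration, not from the patch: getAlignmentUpTo256 above counts up to 8+4+2+1 = 15 trailing
//  zero bits of the address, so a pointer whose address ends in 0x...40 reports 1 << 6 = 64 bytes,
//  an odd address reports 1, and anything with 8 or more trailing zero bits is capped at the
//  256-byte maximum that the allocator guarantees.)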
+ int alignmentA = getAlignmentUpTo256(A); + int alignmentB = getAlignmentUpTo256(B); + int alignmentC = getAlignmentUpTo256(C); + + CUBLAS_CHECK(cublasLtMatmulPreferenceCreate(&preference)); + CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize))); + CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, &alignmentA, sizeof(alignmentA))); + CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, &alignmentB, sizeof(alignmentB))); + CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, &alignmentC, sizeof(alignmentC))); + CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES, &alignmentC, sizeof(alignmentC))); + CUBLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, preference, 1, &heuristicResult, &returnedResults)); + + cublasStatus_t opStatus = cublasLtMatmul(ltHandle, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, + &heuristicResult.algo, workspace, workspaceSize, stream); + + if (preference) CUBLAS_CHECK(cublasLtMatmulPreferenceDestroy(preference)); + if (Cdesc) CUBLAS_CHECK(cublasLtMatrixLayoutDestroy(Cdesc)); + if (Bdesc) CUBLAS_CHECK(cublasLtMatrixLayoutDestroy(Bdesc)); + if (Adesc) CUBLAS_CHECK(cublasLtMatrixLayoutDestroy(Adesc)); + if (operationDesc) CUBLAS_CHECK(cublasLtMatmulDescDestroy(operationDesc)); + + return opStatus; +} + +static cublasStatus_t cublasLtAffineTyped(cublasLtHandle_t ltHandle, cublasOperation_t transA, cublasOperation_t transB, + int m, int n, int k, const half *alpha, const half *A, int lda, const half *B, + int ldb, const half *beta, half *C, int ldc, const half* bias, + half* workspace, size_t workspaceSizeBytes, bool do_relu, cudaStream_t stream) { + return cublasLtAffineHelper(ltHandle, transA, transB, CUDA_R_16F, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, bias, + workspace, workspaceSizeBytes, do_relu, stream); +} + +static cublasStatus_t cublasLtAffineTyped(cublasLtHandle_t ltHandle, cublasOperation_t transA, cublasOperation_t transB, + int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, + int ldb, const float *beta, float *C, int ldc, const float* bias, + float* workspace, size_t workspaceSizeBytes,bool do_relu, cudaStream_t stream) { + + return cublasLtAffineHelper(ltHandle, transA, transB, CUDA_R_32F, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, bias, + workspace, workspaceSizeBytes, do_relu, stream); +} + +template +void affineTyped(marian::Tensor C, Ptr allocator, const marian::Tensor& A, const marian::Tensor& B, const marian::Tensor& bias, + bool transA, bool transB, T beta, T scalar, bool do_relu) { + + CUDA_CHECK(cudaSetDevice((int)C->getDeviceId().no)); + T alpha = scalar; + + int m = A->shape().elements() / A->shape().back(); + int k = A->shape().back(); + if(transA) + std::swap(m, k); + + int l = B->shape().elements() / B->shape().back(); + int n = B->shape().back(); + if(transB) + std::swap(l, n); + + int lda = A->shape().back(); + int ldb = B->shape().back(); + int ldc = B->shape().back(); + + size_t bias_size = bias->shape().elements(); + ABORT_IF(n != bias_size, "The number of elements in the bias must match the number of columns in C"); + + if(transB) + ldc = B->shape().elements() / B->shape().back(); + + cublasOperation_t opA = transA ? 
CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + auto backend = std::static_pointer_cast(C->getBackend()); + auto cublasHandle = backend->getCublasHandle(); + auto ltHandle = (cublasLtHandle_t)backend->getCublasHandle(); // A cublas handle encapsulates an lt handle + + size_t numWorkSpaceElts = 8192; // Allows for cublasLt to perform split-K gemms. This is chosen to be at least + // 16 KiB for float16 which is large enough to prevent alloc failed errors + size_t workspaceSizeBytes = numWorkSpaceElts * sizeof(T); + IPtr workspace = allocator->alloc(numWorkSpaceElts); + + cudaStream_t stream = 0; + CUBLAS_CHECK(cublasGetStream(cublasHandle, &stream)); + + + CUBLAS_CHECK(cublasLtAffineTyped(ltHandle, + opB, + opA, + n, + m, + k, + &alpha, + B->data(), + ldb, + A->data(), + lda, + &beta, + C->data(), + ldc, + bias->data(), + workspace->data(), + workspaceSizeBytes, + do_relu, + stream)); + + allocator->free(workspace); +} + +// This version is needed so that Windows doesn't complain when compiling CUDA < 11. Otherwise, the ifdef could be inside of one +// definition of Affine. +void Affine(marian::Tensor C, + Ptr allocator, + const marian::Tensor& A, + const marian::Tensor& B, + const marian::Tensor& bias, + bool transA, bool transB, float beta, float scalar, bool do_relu) { + // There is a bug in CUDA 11 where the bias pointer needs to be 8 byte aligned. This bug will be fix in a subsequent release. For now, + // we launch a custom epilogue if the bias does not meet the alignment requirement. + if(C->type() == Type::float32) { + affineTyped(C, allocator, A, B, bias, transA, transB, beta, scalar, do_relu); + if((uintptr_t)bias->data() % REQUIRED_BIAS_ALIGNMENT != 0) { + BiasAdd(C, bias, do_relu); + } +#if COMPILE_FP16 + } else if(C->type() == Type::float16) { + affineTyped(C, allocator, A, B, bias, transA, transB, __float2half(beta), __float2half(scalar), do_relu); + if((uintptr_t)bias->data() % REQUIRED_BIAS_ALIGNMENT != 0) { + BiasAdd(C, bias, do_relu); + } +#endif + } else { + ABORT("Affine not implemented for type {}", C->type()); + } +} + +#else + +void Affine(marian::Tensor C, + Ptr /*allocator*/, + const marian::Tensor& A, + const marian::Tensor& B, + const marian::Tensor& bias, + bool transA, bool transB, float beta, float scalar, bool do_relu) { + + if(C->type() == Type::float32) { + ProdTyped(C, A, B, transA, transB, beta, scalar); +#if COMPILE_FP16 + } else if(C->type() == Type::float16) { + ProdTyped(C, A, B, transA, transB, __float2half(beta), __float2half(scalar)); +#endif + } else { + ABORT("Prod not implemented for type {}", C->type()); + } + BiasAdd(C, bias, do_relu); +} +#endif + } // namespace gpu } // namespace marian diff --git a/src/tensors/gpu/prod.cu b/src/tensors/gpu/prod.cu new file mode 100644 index 000000000..ec01d57e9 --- /dev/null +++ b/src/tensors/gpu/prod.cu @@ -0,0 +1,69 @@ +#include +#include "tensors/tensor.h" +#include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/backend.h" + +namespace marian { +namespace gpu { + +template +__global__ static void gBiasAddFused(T* tensor, T* bias, size_t tensor_size, size_t bias_size, ActFunc f) { + const size_t row_start = blockIdx.x * bias_size; + for(int bias_offset = threadIdx.x; bias_offset < bias_size; bias_offset+=blockDim.x) { + size_t offset_into_tensor = row_start + bias_offset; + if(offset_into_tensor < tensor_size) { + T added_bias = tensor[offset_into_tensor] + bias[bias_offset]; + tensor[offset_into_tensor] = f(added_bias); + } + } +} + +struct identity { + 
template + __device__ constexpr T&& operator() (T&& t) const noexcept { + return std::forward(t); + } +}; + +struct reluAct { + template + __device__ T operator() (T t) const noexcept { + return t > (T) 0? t : (T) 0; + } +}; + +void BiasAdd(marian::Tensor C, const marian::Tensor& bias, bool do_relu) { + auto backend = std::static_pointer_cast(C->getBackend()); + CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no)); + + size_t size = C->shape().elements(); + size_t bias_size = bias->shape().elements(); + + int m = C->shape().elements() / C->shape().back(); + int n = C->shape().back(); + + ABORT_IF(n != bias_size, "The number of elements in the bias must match the number of columns in C"); + + int threads_per_block = std::min(MAX_THREADS, n); + int blocks = m; + + if(C->type() == Type::float32) { + if (do_relu) + gBiasAddFused<<>>(C->data(), bias->data(), size, bias_size, reluAct()); + else + gBiasAddFused<<>>(C->data(), bias->data(), size, bias_size, identity()); + +#if COMPILE_FP16 + } else if(C->type() == Type::float16) { + if (do_relu) + gBiasAddFused<<>>(C->data(), bias->data(), size, bias_size, reluAct()); + else + gBiasAddFused<<>>(C->data(), bias->data(), size, bias_size, identity()); +#endif + } else { + ABORT("Prod not implemented for type {}", C->type()); + } +} + +} +} \ No newline at end of file diff --git a/src/tensors/gpu/prod.h b/src/tensors/gpu/prod.h index 63b9192a5..aec8cb738 100644 --- a/src/tensors/gpu/prod.h +++ b/src/tensors/gpu/prod.h @@ -6,6 +6,21 @@ namespace marian { namespace gpu { +void BiasAdd(marian::Tensor C, + const marian::Tensor& bias, + bool do_relu = false); + +void Affine(marian::Tensor C, + Ptr allocator, + const marian::Tensor& A, + const marian::Tensor& B, + const marian::Tensor& bias, + bool transA, + bool transB, + float beta = 0, + float scalar = 1, + bool do_relu = false); + void Prod(marian::Tensor C, const marian::Tensor& A, const marian::Tensor& B, diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 83bce8194..af7946dde 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -106,6 +106,8 @@ DISPATCH8(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bo DISPATCH8(ProdBatched, marian::Tensor, Ptr, const marian::Tensor, const marian::Tensor, bool, bool, float, float) DISPATCH9(CSRProd, marian::Tensor, Ptr, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float) +DISPATCH10(Affine, marian::Tensor, Ptr, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float, bool) + DISPATCH2(Softmax, marian::Tensor, marian::Tensor) DISPATCH3(SoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor) diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 27ccf1396..c3fd4a9e7 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -32,6 +32,8 @@ void tests(DeviceType device, Type floatType = Type::float32) { Config::seed = 1234; auto graph = New(); + + graph->setInference(true); graph->setDefaultElementType(floatType); graph->setDevice({0, device}); graph->reserveWorkspaceMB(16); @@ -539,15 +541,19 @@ void tests(DeviceType device, Type floatType = Type::float32) { values.clear(); std::vector vA({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - std::vector vB({1, 2, 3, 4, 5, 6}); - std::vector vAff({24, 30, 51, 66, 78, 102, 105, 138}); + std::vector vB({1, -2, 3, 4, -5, 6}); + std::vector vAff({-6, 26, -9, 50, -12, 74, -15, 98}); + 
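// (For clarity: the expected values are C = A·B + bias with A a 4x3 row-major matrix {1..12},
//  B a 3x2 row-major matrix {1,-2,3,4,-5,6} and bias 2, e.g. first row: [1,2,3]·[1,3,-5] + 2 = -6
//  and [1,2,3]·[-2,4,6] + 2 = 26; vAffRelu below is the same result with negative entries clamped to 0.)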
std::vector vAffRelu({0, 26, 0, 50, 0, 74, 0, 98}); auto A = graph->param("A", {4, 3}, inits::fromVector(vA)); auto B = graph->param("B", {3, 2}, inits::fromVector(vB)); - auto C = graph->param("C", {4, 2}, inits::fromValue(2)); + auto bias = graph->param("C", {1, 2}, inits::fromValue(2)); + + auto aff1 = affine(A, B, bias); + auto aff2 = dot(A, B) + bias; - auto aff1 = affine(A, B, C); - auto aff2 = dot(A, B) + C; + auto affRelu1 = affineWithRelu(A, B, bias); + auto affRelu2 = relu(dot(A, B) + bias); graph->forward(); @@ -559,6 +565,11 @@ void tests(DeviceType device, Type floatType = Type::float32) { CHECK(aff2->shape() == aff1->shape()); aff2->val()->get(values2); CHECK(values2 == values); + + affRelu1->val()->get(values); + affRelu2->val()->get(values2); + CHECK(values2 == vAffRelu); + CHECK(values2 == values); } SECTION("repeat") { From a17ee300f4567cb747a93b0559d48b664d0e882e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 8 Apr 2021 21:48:01 -0700 Subject: [PATCH 006/254] Create VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index e219c7fab..715ea6990 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.10.14 +v1.10.15 From be6506562321be49f8ef4b8c2a469662bbc1cfa4 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 9 Apr 2021 09:02:34 -0700 Subject: [PATCH 007/254] Allow to choose fine-grained CPU intrinsics on as CMake options (#849) * allow to choose fine-grained CPU intrinsics on as CMake options * inform user that e.g. -DCOMPILE_AVX2=off will be ignored with -march=native if there is compiler support --- CHANGELOG.md | 2 ++ CMakeLists.txt | 79 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfdfe5107..9182057c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Allow for fine-grained CPU intrinsics overrides when BUILD_ARCH != native e.g. -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off - Adds custom bias epilogue kernel. - Adds support for fusing relu and bias addition into gemms when using cuda 11. - Better suppression of unwanted output symbols, specifically "\n" from SentencePiece with byte-fallback. Can be deactivated with --allow-special @@ -36,6 +37,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Broken links to MNIST data sets ### Changed +- For BUILD_ARCH != native enable all intrinsics types by default, can be disabled like this: -DCOMPILE_AVX512=off - Moved FBGEMM pointer to commit c258054 for gcc 9.3+ fix - Change compile options a la -DCOMPILE_CUDA_SM35 to -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL, -DCOMPILE_PASCAL, -DCOMPILE_VOLTA, -DCOMPILE_TURING and -DCOMPILE_AMPERE diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c50681f2..79c8585e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,50 +124,81 @@ else(MSVC) # Detect support CPU instrinsics for the current platform. This will # only by used with BUILD_ARCH=native. For overridden BUILD_ARCH we - # minimally use -msse4.1. This seems to work with MKL. + # force intrinsics as set in the options. 
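# (Usage note: with -DBUILD_ARCH=native the compiler still selects intrinsics itself and the
#  COMPILE_* options below only trigger a warning that they are ignored; with an explicit target,
#  e.g. -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off, the corresponding -mavx512f flag is simply not
#  appended to the intrinsics list.)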
set(INTRINSICS "") list(APPEND INTRINSICS_NVCC) + option(COMPILE_SSE2 "Compile CPU code with SSE2 support" ON) + option(COMPILE_SSE3 "Compile CPU code with SSE3 support" ON) + option(COMPILE_SSE4_1 "Compile CPU code with SSE4.1 support" ON) + option(COMPILE_SSE4_2 "Compile CPU code with SSE4.2 support" ON) + option(COMPILE_AVX "Compile CPU code with AVX support" ON) + option(COMPILE_AVX2 "Compile CPU code with AVX2 support" ON) + option(COMPILE_AVX512 "Compile CPU code with AVX512 support" ON) + if(BUILD_ARCH STREQUAL "native") + message(STATUS "Building with -march=native and intrinsics will be chosen automatically by the compiler to match the current machine.") message(STATUS "Checking support for CPU intrinsics") include(FindSSE) - if(SSE2_FOUND) - message(STATUS "SSE2 support found") + if(SSE2_FOUND AND NOT COMPILE_SSE2) + message(WARNING "SSE2 enabled due to -march=native and -DCOMPILE_SSE2=${COMPILE_SSE2} is ignored.") + endif(SSE2_FOUND AND NOT COMPILE_SSE2) + if(SSE3_FOUND AND NOT COMPILE_SSE3) + message(WARNING "SSE3 enabled due to -march=native and -DCOMPILE_SSE3=${COMPILE_SSE3} is ignored.") + endif(SSE3_FOUND AND NOT COMPILE_SSE3) + if(SSE4_1_FOUND AND NOT COMPILE_SSE4_1) + message(WARNING "SSE4.1 enabled due to -march=native and -DCOMPILE_SSE4_1=${COMPILE_SSE4_1} is ignored.") + endif(SSE4_1_FOUND AND NOT COMPILE_SSE4_1) + if(SSE4_2_FOUND AND NOT COMPILE_SSE4_2) + message(WARNING "SSE4.2 enabled due to -march=native and -DCOMPILE_SSE4_2=${COMPILE_SSE4_2} is ignored.") + endif(SSE4_2_FOUND AND NOT COMPILE_SSE4_2) + if(AVX_FOUND AND NOT COMPILE_AVX) + message(WARNING "AVX enabled due to -march=native and -DCOMPILE_AVX=${COMPILE_AVX} is ignored.") + endif(AVX_FOUND AND NOT COMPILE_AVX) + if(AVX2_FOUND AND NOT COMPILE_AVX2) + message(WARNING "AVX2 enabled due to -march=native and -DCOMPILE_AVX2=${COMPILE_AVX2} is ignored.") + endif(AVX2_FOUND AND NOT COMPILE_AVX2) + if(AVX512_FOUND AND NOT COMPILE_AVX512) + message(WARNING "AVX512 enabled due to -march=native and -DCOMPILE_AVX512=${COMPILE_AVX512} is ignored.") + endif(AVX512_FOUND AND NOT COMPILE_AVX512) + else() + # force to build with the requested intrisics, requires compiler support + message(STATUS "Building with -march=${BUILD_ARCH} and forcing intrisics as requested") + if(COMPILE_SSE2) + message(STATUS "SSE2 support requested") set(INTRINSICS "${INTRINSICS} -msse2") list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse2) - endif(SSE2_FOUND) - if(SSE3_FOUND) - message(STATUS "SSE3 support found") + endif(COMPILE_SSE2) + if(COMPILE_SSE3) + message(STATUS "SSE3 support requested") set(INTRINSICS "${INTRINSICS} -msse3") list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse3) - endif(SSE3_FOUND) - if(SSE4_1_FOUND) - message(STATUS "SSE4.1 support found") + endif(COMPILE_SSE3) + if(COMPILE_SSE4_1) + message(STATUS "SSE4.1 support requested") set(INTRINSICS "${INTRINSICS} -msse4.1") list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.1) - endif(SSE4_1_FOUND) - if(SSE4_2_FOUND) - message(STATUS "SSE4.2 support found") + endif(COMPILE_SSE4_1) + if(COMPILE_SSE4_2) + message(STATUS "SSE4.2 support requested") set(INTRINSICS "${INTRINSICS} -msse4.2") list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.2) - endif(SSE4_2_FOUND) - if(AVX_FOUND) - message(STATUS "AVX support found") + endif(COMPILE_SSE4_2) + if(COMPILE_AVX) + message(STATUS "AVX support requested") set(INTRINSICS "${INTRINSICS} -mavx") list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx) - endif(AVX_FOUND) - if(AVX2_FOUND) - message(STATUS "AVX2 support found") + endif(COMPILE_AVX) + if(COMPILE_AVX2) + 
message(STATUS "AVX2 support requested") set(INTRINSICS "${INTRINSICS} -mavx2") list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx2) - endif(AVX2_FOUND) - if(AVX512_FOUND) - message(STATUS "AVX512 support found") + endif(COMPILE_AVX2) + if(COMPILE_AVX512) + message(STATUS "AVX512 support requested") set(INTRINSICS "${INTRINSICS} -mavx512f") list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx512f) - endif(AVX512_FOUND) - else() - set(INTRINSICS "-msse4.1") + endif(COMPILE_AVX512) endif() if(USE_FBGEMM) From fdf9fe7d4a98268b5d451bc5b5944da959fbe8e2 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 9 Apr 2021 09:03:39 -0700 Subject: [PATCH 008/254] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 715ea6990..8a8c98ae3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.10.15 +v1.10.16 From a05124176d8869962b717a3557c383406f8c76f4 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 9 Apr 2021 18:44:11 +0000 Subject: [PATCH 009/254] Merged PR 18531: Install GCC in Azure pipelines This fixes Azure pipelines after recent changes in Azure-hosted runners removing GCC 8 and older on some Ubuntu images. GCC is now installed explicitly via `apt-get`. --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a32a82884..dfed6ab40 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -235,11 +235,11 @@ stages: # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev - - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler + # GCC 8 and lower are no longer pre-installed + - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-$(gcc) g++-$(gcc) displayName: Install packages # Boost is no longer pre-installed on Azure/GitHub-hosted runners - # TODO: check which Boost components are really needed and update the list - bash: sudo apt-get install -y libboost-system-dev displayName: Install Boost condition: eq(variables.boost, true) From caddad90cdb06283da2f5d17a3340ca8c6387b38 Mon Sep 17 00:00:00 2001 From: Martin Junczys-Dowmunt Date: Sat, 10 Apr 2021 15:28:38 +0000 Subject: [PATCH 010/254] Merged PR 18505: RMSNorm on GPU Support for RMSNorm as drop-in replace for LayerNorm from _Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization_. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. --- CHANGELOG.md | 1 + VERSION | 2 +- src/graph/expression_graph.cpp | 11 +- src/graph/expression_operators.cpp | 12 ++ src/graph/expression_operators.h | 12 ++ src/graph/node_operators_binary.h | 58 ++++++ src/layers/generic.h | 6 + src/models/transformer.h | 4 + src/tensors/cpu/tensor_operators.cpp | 196 ++++++++++++++++++-- src/tensors/gpu/tensor_operators.cu | 267 +++++++++++++++++++++++++++ src/tensors/tensor_operators.h | 49 +++++ src/tests/units/operator_tests.cpp | 43 +++++ 12 files changed, 638 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6884d5f8..ce4bef246 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Support for RMSNorm as drop-in replace for LayerNorm from `Biao Zhang; Rico Sennrich (2019). 
Root Mean Square Layer Normalization`. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. - Extend suppression of unwanted output symbols, specifically "\n" from default vocabulary if generated by SentencePiece with byte-fallback. Deactivates with --allow-special - Allow for fine-grained CPU intrinsics overrides when BUILD_ARCH != native e.g. -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off - Adds custom bias epilogue kernel. diff --git a/VERSION b/VERSION index 8a8c98ae3..7af87e875 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.10.16 +v1.10.17 diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 827fb3ed8..12a1195e1 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -208,8 +208,15 @@ void ExpressionGraph::backward(bool reset, float clipValue) { } if(v->trainable() && v->marked_for_debug()) { - LOG(info, "Debug Grad: {} op={}", v->debug_message(), v->type()); - LOG(info, v->grad()->debug()); + Logger log = spdlog::get("general"); + if(log) { + LOG(info, "Debug Grad: {} op={}", v->debug_message(), v->type()); + LOG(info, v->grad()->debug()); + } + else { + std::cerr << "Debug Grad: " << v->debug_message() << " op=" << v->type() << std::endl; + std::cerr << v->grad()->debug() << std::endl; + } } if(v->trainable() && clipValue != 0) { diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 048c74789..6c7ef91ce 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -749,6 +749,18 @@ Expr layerNorm(Expr x, return Expression(nodes, eps); } +Expr rmsNorm(Expr x, + Expr gamma, + Expr beta /*= nullptr*/, + float eps /*= 1e-9*/) { + + // layerNorm accumulates in float, so small eps is fine + std::vector nodes = {x, gamma}; + if(beta) + nodes.push_back(beta); + return Expression(nodes, eps); +} + Expr highway(Expr y, Expr x, Expr t) { std::vector nodes = {y, x, t}; return Expression(nodes); diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 81b0f5ea2..f3d84eb6b 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -915,6 +915,18 @@ Expr weighted_average(Expr in, Expr weights, int ax = 0); */ Expr layerNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); +/** + * Applies RMS normalization over the last dimension. + * + * See: Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization. + * In Advances in Neural Information Processing Systems 32. Vancouver, Canada. + * @f[ + \frac{x}{\sqrt{\frac{1}{N}\sum x^2 + \mathrm{eps}}} \times \gamma + \beta + * @f] + * @see RMSNormalizationOp + */ +Expr rmsNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); + /** * Highway transformation. * Computes the highway tranform on @p y and @p x as gated by @p t: diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 55f105a96..91fc29da2 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1369,6 +1369,64 @@ struct LayerNormalizationOp : public NaryNodeOp { float eps_; }; +// RMS norm along last axis +struct RMSNormalizationOp : public NaryNodeOp { +public: + RMSNormalizationOp(const std::vector& nodes, float eps = 1e-9) + : NaryNodeOp(nodes), eps_(eps) { + // @TODO: dimension check + } + + NodeOps forwardOps() override { + return {NodeOp( + RMSNormalization(val_, + child(0)->val(), + child(1)->val(), + (children_.size() == 3) ? 
child(2)->val() : nullptr, + eps_))}; + } + + // @BUGBUG: backward has not been tested for broadcasting gamma/beta + NodeOps backwardOps() override { + return {NodeOp( + RMSNormalizationGrad( + graph()->allocator(), + child(0)->grad(), + child(1)->grad(), + (children_.size() == 3) ? child(2)->grad() : nullptr, + adj_, + val_, + child(0)->val(), + child(1)->val(), + (children_.size() == 3) ? child(2)->val() : nullptr, + eps_))}; + } + + const std::string type() override { return "rms_normalization"; } + + virtual size_t hash() override { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, eps_); + return seed; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(eps_ != cnode->eps_) + return false; + return true; + } + +private: + friend class SerializationHelpers; // @TODO: use the same name for this as SqrtNodeOp + float eps_; +}; + + struct HighwayNodeOp : public NaryNodeOp { HighwayNodeOp(const std::vector& nodes) : NaryNodeOp(nodes) {} diff --git a/src/layers/generic.h b/src/layers/generic.h index 5eb936151..2746bc854 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -212,4 +212,10 @@ static inline Expr layerNorm(Expr x, std::string prefix, std::string suffix = st return marian::layerNorm(x, scale, bias, 1e-6f); } +static inline Expr rmsNorm(Expr x, std::string prefix, std::string suffix = std::string()) { + int dimModel = x->shape()[-1]; + auto scale = x->graph()->param(prefix + "_rms_scale" + suffix, {1, dimModel}, inits::ones()); + return marian::rmsNorm(x, scale, nullptr, 1e-6f); +} + } // namespace marian diff --git a/src/models/transformer.h b/src/models/transformer.h index 79b59000a..1da02318e 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -176,6 +176,8 @@ class Transformer : public EncoderOrDecoderBase { // layer normalization else if (op == 'n') output = layerNorm(output, prefix, "_pre"); + else if (op == 'r') + output = rmsNorm(output, prefix, "_pre"); else ABORT("Unknown pre-processing operation '{}'", op); } @@ -201,6 +203,8 @@ class Transformer : public EncoderOrDecoderBase { // layer normalization else if(op == 'n') output = layerNorm(output, prefix); + else if(op == 'r') + output = rmsNorm(output, prefix); else ABORT("Unknown pre-processing operation '{}'", op); } diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1191a2bec..67d993fcb 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -977,7 +977,7 @@ float L2Norm(Tensor in, Ptr /*not used*/) { float sum = 0.f; size_t size = in->size(); const float* data = in->data(); -#pragma omp parallel for simd reduction(+ : sum) + #pragma omp parallel for simd reduction(+ : sum) for(size_t i = 0; i < size; ++i) { sum += data[i] * data[i]; } @@ -998,14 +998,14 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) { int rows = m; int cols = k; -#pragma omp parallel for + #pragma omp parallel for for(int j = 0; j < rows; ++j) { const float* vaRow = va; const float* ctxRow = ctx + (j % (b * t)) * cols; const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols; float sum = 0.f; -#pragma omp simd reduction(+ : sum) + #pragma omp simd reduction(+ : sum) for(int i = 0; i < cols; ++i) { float z = ctxRow[i] + stateRow[i]; sum += std::tanh(z) * vaRow[i]; @@ -1035,7 +1035,7 @@ void AttBack(Tensor gVa_, size_t k = context_->shape()[-1]; size_t n = 
context_->shape()[-2]; -#pragma omp parallel for reduction(+ : gState[:n * k], gVa[:k]) + #pragma omp parallel for reduction(+ : gState[:n * k], gVa[:k]) for(size_t j = 0; j < m; ++j) { float* gcRow = gContext + j * k; float* gsRow = gState + (j % n) * k; @@ -1045,7 +1045,7 @@ void AttBack(Tensor gVa_, float adj_j = adj[j]; -#pragma omp simd + #pragma omp simd for(size_t i = 0; i < k; ++i) { float z = cRow[i] + sRow[i]; @@ -1070,20 +1070,20 @@ void LayerNormalizationImpl(float* out, float eps, int rows, int cols) { -#pragma omp parallel for + #pragma omp parallel for for(int j = 0; j < rows; ++j) { float* so = out + j * cols; const float* sp = in + j * cols; float sum = 0.f; -#pragma omp simd reduction(+ : sum) + #pragma omp simd reduction(+ : sum) for(int i = 0; i < cols; ++i) { sum += sp[i]; } float mean = sum / cols; float sqSum = 0.f; -#pragma omp simd reduction(+ : sqSum) + #pragma omp simd reduction(+ : sqSum) for(int i = 0; i < cols; ++i) { float ex = sp[i] - mean; sqSum += ex * ex; @@ -1091,7 +1091,7 @@ void LayerNormalizationImpl(float* out, float sigma = std::sqrt(sqSum / cols + eps); -#pragma omp simd + #pragma omp simd for(int i = 0; i < cols; ++i) { float t = alpha[alphaStride * i] * ((sp[i] - mean) / sigma); if(hasBeta) @@ -1168,7 +1168,7 @@ void LayerNormalizationGrad(Tensor gradX_, size_t cols = y_->shape()[-1]; if(beta) { -#pragma omp parallel for reduction(+ : gradGamma[:cols], gradBeta[:cols]) + #pragma omp parallel for reduction(+ : gradGamma[:cols], gradBeta[:cols]) for(size_t j = 0; j < rows; ++j) { const float* xRow = x + j * cols; const float* yRow = y + j * cols; @@ -1180,7 +1180,7 @@ void LayerNormalizationGrad(Tensor gradX_, float sum_adj_x = 0.f; float sum_sqr = 0.f; -#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj) + #pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj) for(size_t i = 0; i < cols; ++i) { sum_x += xRow[i]; sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[betaStride * i] : 0.f)) / gamma[gammaStride * i]; @@ -1188,14 +1188,14 @@ void LayerNormalizationGrad(Tensor gradX_, } float mean = sum_x / cols; -#pragma omp simd reduction(+ : sum_sqr) + #pragma omp simd reduction(+ : sum_sqr) for(size_t i = 0; i < cols; ++i) { float ex = xRow[i] - mean; sum_sqr += ex * ex; } float sigma = std::sqrt(sum_sqr / cols + eps); -#pragma omp simd + #pragma omp simd for(size_t i = 0; i < cols; ++i) { float grad_x = 0.f; float x_hat = (yRow[i] - beta[betaStride * i]) / gamma[gammaStride * i]; @@ -1209,8 +1209,8 @@ void LayerNormalizationGrad(Tensor gradX_, gradBeta[betaStride * i] += adjRow[i]; } } - } else { -#pragma omp parallel for reduction(+ : gradGamma[:cols]) + } else { // @TODO: this code duplication is really ugly, but required for omp to work correctly? + #pragma omp parallel for reduction(+ : gradGamma[:cols]) for(size_t j = 0; j < rows; ++j) { const float* xRow = x + j * cols; const float* yRow = y + j * cols; @@ -1222,23 +1222,22 @@ void LayerNormalizationGrad(Tensor gradX_, float sum_adj_x = 0.f; float sum_sqr = 0.f; -#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj) + #pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj) for(size_t i = 0; i < cols; ++i) { sum_x += xRow[i]; - sum_adj_x += adjRow[i] * (yRow[i] - (beta ? 
beta[betaStride * i] : 0.f)) / gamma[gammaStride * i]; - // @TODO: beta is NULL here ^^ + sum_adj_x += adjRow[i] * yRow[i] / gamma[gammaStride * i]; sum_adj += adjRow[i]; } float mean = sum_x / cols; -#pragma omp simd reduction(+ : sum_sqr) + #pragma omp simd reduction(+ : sum_sqr) for(size_t i = 0; i < cols; ++i) { float ex = xRow[i] - mean; sum_sqr += ex * ex; } float sigma = std::sqrt(sum_sqr / cols + eps); -#pragma omp simd + #pragma omp simd for(size_t i = 0; i < cols; ++i) { float grad_x = 0.f; float x_hat = yRow[i] / gamma[gammaStride * i]; @@ -1255,6 +1254,163 @@ void LayerNormalizationGrad(Tensor gradX_, } MARIAN_FFAST_MATH_END +MARIAN_FFAST_MATH_BEGIN +template +void RMSNormalizationImpl(float* out, + const float* in, + const float* alpha, + const float* beta, + float eps, + int rows, + int cols) { + #pragma omp parallel for + for(int j = 0; j < rows; ++j) { + float* so = out + j * cols; + const float* sp = in + j * cols; + + float sqSum = 0.f; + #pragma omp simd reduction(+ : sqSum) + for(int i = 0; i < cols; ++i) { + sqSum += sp[i] * sp[i]; + } + + float rms = std::sqrt(sqSum / cols + eps); + + #pragma omp simd + for(int i = 0; i < cols; ++i) { + float t = alpha[alphaStride * i] * (sp[i] / rms); + if(hasBeta) + t += beta[betaStride * i]; + + so[i] = t; + } + } +} +MARIAN_FFAST_MATH_END + +template +inline void RMSNormalizationDispatchBeta(float* out, + const float* in, + const float* alpha, + Tensor beta, + float eps, + int rows, + int cols) { + if (beta) { + if (beta->shape().back() > 1) { + RMSNormalizationImpl(out, in, alpha, beta->data(), eps, rows, cols); + } else { + RMSNormalizationImpl(out, in, alpha, beta->data(), eps, rows, cols); + } + } else { + RMSNormalizationImpl(out, in, alpha, nullptr, eps, rows, cols); + } +} + +void RMSNormalization(Tensor out, + Tensor in, + Tensor gamma, + Tensor beta, + float eps) { + const float* alpha = gamma->data(); + const int alphaStride = gamma->shape().back() > 1; // broadcasting for alpha and beta + + int rows = in->shape().elements() / in->shape().back(); + int cols = in->shape().back(); + if (alphaStride == 0) { + RMSNormalizationDispatchBeta<0>(out->data(), in->data(), alpha, beta, eps, rows, cols); + } else { + RMSNormalizationDispatchBeta<1>(out->data(), in->data(), alpha, beta, eps, rows, cols); + } +} + +MARIAN_FFAST_MATH_BEGIN +void RMSNormalizationGrad(Tensor gradX_, + Tensor gradGamma_, + Tensor gradBeta_, + Tensor adj_, + Tensor y_, + Tensor x_, + Tensor gamma_, + Tensor beta_, + float eps) { + float* gradX = gradX_->data(); + float* gradGamma = gradGamma_->data(); + float* gradBeta = gradBeta_ ? gradBeta_->data() : nullptr; + float* adj = adj_->data(); + float* x = x_->data(); + float* y = y_->data(); + float* gamma = gamma_->data(); + float* beta = beta_ ? beta_->data() : nullptr; + // @TODO: The CPU implementation supports scalar gamma and beta. This is a left-over, + // we should enable that in the GPU version as well. + const int gammaStride = gamma_->shape().back() > 1; // broadcasting for alpha and beta. 
0 means it's a scalar + const int betaStride = beta_ && beta_->shape().back() > 1; + + size_t rows = y_->shape().elements() / y_->shape()[-1]; + size_t cols = y_->shape()[-1]; + + if(beta) { + #pragma omp parallel for reduction(+ : gradGamma[:cols], gradBeta[:cols]) + for(size_t j = 0; j < rows; ++j) { + const float* xRow = x + j * cols; + const float* yRow = y + j * cols; + const float* adjRow = adj + j * cols; + float* gradXRow = gradX + j * cols; + + float sum_adj_r = 0.f; + float sum_sqr = 0.f; + + #pragma omp simd reduction(+ : sum_adj_r, sum_sqr) + for(size_t i = 0; i < cols; ++i) { + sum_adj_r += adjRow[i] * (yRow[i] - beta[betaStride * i]) / gamma[gammaStride * i]; + sum_sqr += xRow[i] * xRow[i]; + } + + float rms = std::sqrt(sum_sqr / cols + eps); + #pragma omp simd + for(size_t i = 0; i < cols; ++i) { + float rmsNorm = (yRow[i] - beta[betaStride * i]) / gamma[gammaStride * i]; + float gradNorm = cols * adjRow[i] - rmsNorm * sum_adj_r; + gradNorm /= cols * rms; + + gradXRow[i] += gamma[gammaStride * i] * gradNorm; + gradGamma[gammaStride * i] += adjRow[i] * rmsNorm; + gradBeta[betaStride * i] += adjRow[i]; + } + } + } else { + #pragma omp parallel for reduction(+ : gradGamma[:cols]) + for(size_t j = 0; j < rows; ++j) { + const float* xRow = x + j * cols; + const float* yRow = y + j * cols; + const float* adjRow = adj + j * cols; + float* gradXRow = gradX + j * cols; + + float sum_adj_r = 0.f; + float sum_sqr = 0.f; + + #pragma omp simd reduction(+ : sum_adj_r, sum_sqr) + for(size_t i = 0; i < cols; ++i) { + sum_adj_r += yRow[i] / gamma[gammaStride * i]; + sum_sqr += xRow[i] * xRow[i]; + } + + float rms = std::sqrt(sum_sqr / cols + eps); + #pragma omp simd + for(size_t i = 0; i < cols; ++i) { + float rmsNorm = yRow[i] / gamma[gammaStride * i]; + float gradNorm = cols * adjRow[i] - rmsNorm * sum_adj_r; + gradNorm /= cols * rms; + + gradXRow[i] += gamma[gammaStride * i] * gradNorm; + gradGamma[gammaStride * i] += adjRow[i] * rmsNorm; + } + } + } +} +MARIAN_FFAST_MATH_END + void Shift(Tensor out_, Tensor in_, marian::Shape shift, diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 97f0cdfe0..d55214bc7 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -2303,6 +2303,273 @@ void LayerNormalizationGrad(Ptr allocator, allocator->free(tempOnesMemory); } +template +__global__ void gRMSNormalization(T* out, + const T* in, + const T* gamma, + const T* beta, + int rows, + int cols, + AccType eps = 1e-9) { + extern __shared__ uint8_t _sharedBytes[]; + AccType* _shareAccType = (AccType*)_sharedBytes; + + AccType N = cols; + for(int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if(j < rows) { + T* yRow = out + j * cols; + const T* xRow = in + j * cols; + + AccType* _sqSum = _shareAccType; + + _sqSum[threadIdx.x] = (AccType)0.0f; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + AccType xv = (AccType)xRow[id]; + _sqSum[threadIdx.x] += xv * xv; + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) + _sqSum[threadIdx.x] += _sqSum[threadIdx.x + skip]; + len = (len + 1) >> 1; + } + __syncthreads(); + AccType rms = functional::Ops::sqrt(_sqSum[0] / N + eps); // all AccType + __syncthreads(); + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + AccType gammav = (AccType)gamma[id]; + AccType xv = 
(AccType)xRow[id]; + AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; + AccType rmsNorm = xv / rms; + AccType y = gammav * rmsNorm + betav; + yRow[id] = (T)y; + } + } + } + __syncthreads(); + } +} + +void RMSNormalization(Tensor out, + Tensor in, + Tensor gamma, + Tensor beta, + float eps) { + cudaSetDevice(out->getDeviceId().no); + + int rows = in->shape().elements() / in->shape().back(); + int cols = in->shape().back(); + + int blocks = std::min(MAX_BLOCKS, (int)rows); + int threads = std::min(MAX_THREADS, (int)cols); + int shared = threads * sizeof(float); + + if(out->type() == Type::float32) { + gRMSNormalization<<>>(out->data(), + in->data(), + gamma->data(), + beta ? beta->data() : nullptr, + rows, + cols, + eps); +#if COMPILE_FP16 + } else if (out->type() == Type::float16) { + gRMSNormalization<<>>(out->data(), + in->data(), + gamma->data(), + beta ? beta->data() : nullptr, + rows, + cols, + eps); +#endif + } else { + ABORT("RMSNormalization not implemented for type {}", out->type()); + } +} + +template +__global__ void gRMSNormalizationGrad(T* gradX, + T* gradGamma, + T* adj, + T* y, + T* x, + T* gamma, + T* beta, + int rows, + int cols, + AccType eps = 1e-9) { + extern __shared__ uint8_t sharedBytes[]; + AccType* shared = (AccType*)sharedBytes; + + AccType N = cols; + + for(int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if(j < rows) { + AccType* sum_adj_r = shared; // sum of gradient coming in times layerNorm from value + AccType* sum_sqr = shared + blockDim.x; // sum of x^2 + + const T* xRow = x + j * cols; + const T* yRow = y + j * cols; + const T* adjRow = adj + j * cols; + + sum_adj_r[threadIdx.x] = (AccType)0.0f; + sum_sqr[threadIdx.x] = (AccType)0.0f; + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + AccType xv = xRow[id]; + AccType yv = yRow[id]; + AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; + AccType gammav = (AccType)gamma[id]; + AccType adjv = adjRow[id]; + AccType rv = (yv - betav) / gammav; // go back to RMSNorm(x) from scaled and shifted version for accumulation + + sum_adj_r[threadIdx.x] += adjv * rv; + sum_sqr[threadIdx.x] += xv * xv; + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) { + sum_adj_r[threadIdx.x] += sum_adj_r[threadIdx.x + skip]; // Accumulates in AccType + sum_sqr[threadIdx.x] += sum_sqr[threadIdx.x + skip]; // Accumulates in AccType + } + len = (len + 1) >> 1; + } + + __syncthreads(); + AccType rms = functional::Ops::sqrt(sum_sqr[0] / N + eps); + __syncthreads(); + + // Jacobian of RMS norm + // J = [ \frac{1}{N * rms} (N\delta_{ij} - RN_i RN_j) ]_{ij} + // J * a = dC/dx_i = ( N a_i - RN_i \sum_j RN_j a_j ) / (N * rms) + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + + AccType xv = xRow[id]; + AccType gammav = (AccType)gamma[id]; + AccType adjv = adjRow[id]; + AccType rmsNorm = xv / rms; + + AccType gradNorm = N * adjv - rmsNorm * sum_adj_r[0]; + gradNorm /= N * rms; + + AccType gradXv = gammav * gradNorm; + + // Keep RMSN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. + // @TODO: to be fixed and removed. + AccType sign = functional::Ops::sgn(gradXv); + AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option? or better: make obsolete. + gradXv = functional::Ops::abs(gradXv) > cutoff ? 
sign * cutoff : gradXv; // if gradXv is NaN the value return is NaN too because NaN > value is false. + + // @TODO: frankly, this is embarrasing and should rather be removed or optional? It does help for low precision computation though. Maybe turn into option? + gradXv = isnan(gradXv) ? 0.f : gradXv; // turn NaN into 0. + + T* gradXRow = gradX + j * cols; + gradXRow[id] += (T)(gradXv); + + T* gradGammaRow = gradGamma + j * cols; + // assignment is correct here as this gets summed up + // in the next kernel via matrix product + gradGammaRow[id] = (T)(adjv * rmsNorm); + } + } + } + __syncthreads(); + } +} + +void RMSNormalizationGrad(Ptr allocator, + Tensor gradX, + Tensor gradGamma, + Tensor gradBeta, + Tensor adj, + Tensor y, + Tensor x, + Tensor gamma, + Tensor beta, + float eps) { + cudaSetDevice(adj->getDeviceId().no); + int rows = y->shape().elements() / y->shape()[-1]; + int cols = y->shape()[-1]; + + int threads = std::min(MAX_THREADS, cols); + int blocks = std::min(MAX_BLOCKS, rows); + + auto tempGradGammaMemory = allocator->alloc(adj->memory()->size()); + Tensor tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend()); + tempGradGamma->set(0.f); + + auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type())); + Tensor tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend()); + tempOnes->set(1.f); + + if(gradX->type() == Type::float32) { + int shared = sizeof(float) * threads * 2; + gRMSNormalizationGrad<<>>( + gradX->data(), + tempGradGamma->data(), + adj->data(), + y->data(), + x->data(), + gamma->data(), + (beta) ? beta->data() : nullptr, + rows, + cols, + eps); +#if COMPILE_FP16 + } else if (gradX->type() == Type::float16) { + // accumulate in float + int shared = sizeof(float) * threads * 2; + gRMSNormalizationGrad<<>>( + gradX->data(), + tempGradGamma->data(), + adj->data(), + y->data(), + x->data(), + gamma->data(), + (beta) ? beta->data() : nullptr, + rows, + cols, + eps); +#endif + } else { + ABORT("RMSNormalizationGrad not implemented for type {}", gradX->type()); + } + + // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. + // This is much faster for fp16 which seems to have a broken atomicAdd implementation. + // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. + // This preserves precision with larger batches where all batch entries reduce into a single vector. 
+ // See also AffineNodeOp where we do the same for biases + gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add + + if(gradBeta) // dC/dbeta = adj - inverse broadcasting (reduction) + gpu::Prod(gradBeta, tempOnes, adj, false, false, 1, 1, Type::float32); // beta set to one to add + + allocator->free(tempGradGammaMemory); + allocator->free(tempOnesMemory); +} + + template __global__ void gShift(T* out, const T* in, diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index af7946dde..ef4850683 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -218,6 +218,55 @@ static inline void LayerNormalizationGrad( cpu::LayerNormalizationGrad(gradX, gradGamma, gradBeta, adj, y, x, gamma, beta, eps); } +// clang-format off +DISPATCH5(RMSNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float) + +#ifdef CUDA_FOUND +namespace gpu { +void RMSNormalizationGrad(Ptr allocator, + Tensor gradX, + Tensor gradGamma, + Tensor gradBeta, + Tensor adj, + Tensor y, + Tensor x, + Tensor gamma, + Tensor beta, + float eps); +} +#endif + +namespace cpu { +void RMSNormalizationGrad(Tensor gradX, + Tensor gradGamma, + Tensor gradBeta, + Tensor adj, + Tensor y, + Tensor x, + Tensor gamma, + Tensor beta, + float eps); +} + +static inline void RMSNormalizationGrad( + Ptr allocator, + Tensor gradX, + Tensor gradGamma, + Tensor gradBeta, + Tensor adj, + Tensor y, + Tensor x, + Tensor gamma, + Tensor beta, + float eps) { +#ifdef CUDA_FOUND + if(gradX->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::RMSNormalizationGrad(allocator, gradX, gradGamma, gradBeta, adj, y, x, gamma, beta, eps); + else +#endif + cpu::RMSNormalizationGrad(gradX, gradGamma, gradBeta, adj, y, x, gamma, beta, eps); +} + DISPATCH4(HighwayForward, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH7(HighwayBackward, marian::Tensor, marian::Tensor, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor) diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index c3fd4a9e7..1a18da999 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -300,6 +300,49 @@ void tests(DeviceType device, Type floatType = Type::float32) { } + SECTION("RMS normalization") { + graph->clear(); + values.clear(); + + std::vector init = { + 2.88794374, 4.67853451, 3.96257305, 3.28433037, + 0.37778997, 0.67662024, 4.24959183, 1.23910618, + 0.68929380, 2.00369596, 4.38251686, 1.75624943, + 4.96126175, 3.01947117, 4.72057724, 2.23017120 + }; + + auto a1 = graph->param("test1", {2, 2, 4}, inits::fromVector(init)); + auto a2 = graph->param("test2", {2, 2, 4}, inits::fromVector(init)); + auto gamma = graph->param("gamma", {1, 4}, inits::ones()); + + auto rms = rmsNorm(a1, gamma, nullptr, 1e-5f); + auto rms2 = gamma * (a2 / sqrt(mean(a2 * a2, /*axis=*/-1) + 1e-5f)); + + auto top = sum(flatten(rms + rms2)); + + graph->forward(); + graph->backward(); + + CHECK(rms->shape() == Shape({2, 2, 4})); + + std::vector values2; + + // compare values of rms and rms2 to make sure forward computation is correct + rms->val()->get(values); + rms2->val()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox) ); + + // compare adjoints of a1 and a2 (parameters) to makes sure gradient computation is correct + a1->grad()->get(values); + a2->grad()->get(values2); + + 
CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox) ); + + } + SECTION("reductions") { graph->clear(); values.clear(); From a7c3a0b2ef0d3a9ffd952bb1eb97e680633d1515 Mon Sep 17 00:00:00 2001 From: huangjq0617 Date: Sun, 11 Apr 2021 12:28:04 +0800 Subject: [PATCH 011/254] fix beam_search ABORT when enable openmp and OMP_NUM_THREADS > 1 (#767) --- CHANGELOG.md | 1 + src/tensors/cpu/tensor_operators.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9182057c4..526d93d3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,6 +98,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Added a few missing header files in shortlist.h and beam_search.h. - Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this sigal, potentially delaying exit considerably. It now pays attention to that. Also, the default behaviour of save-and-exit can now be disabled on the command line with --sigterm exit-immediately. - Fix the runtime failures for FASTOPT on 32-bit builds (wasm just happens to be 32-bit) because it uses hashing with an inconsistent mix of uint64_t and size_t. +- fix beam_search ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); when enable openmp and OMP_NUM_THREADS > 1 ### Changed - Remove `--clip-gemm` which is obsolete and was never used anyway diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1191a2bec..d92adba80 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -261,12 +261,12 @@ void TransposeFirst3In4(Tensor out, Tensor in, const std::vector& vAxis) { // find the mapping between the transposed output dimensional indices (oi, oj, ok) // and original input dimensional indices (i, j, k) - int oi, oj, ok; #pragma omp parallel for for(int k = 0; k < l1; ++k) { int shift = k * l2 * l3; for(int j = 0; j < l2; ++j) { for(int i = 0; i < l3; ++i) { + int oi, oj, ok; if(vAxis[0] == 0) { if(vAxis[1] == 1) { oi = i; oj = j; ok = k; From 8a53b761d5bc922e4ab058a4487ad362d2edefaf Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 11 Apr 2021 04:30:35 +0000 Subject: [PATCH 012/254] update version --- VERSION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 7af87e875..b609d445b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1,2 @@ -v1.10.17 +v1.10.18 + From bb6092da2b89c3882aa256480cef13081fd6f50f Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 14 Apr 2021 16:48:51 +0100 Subject: [PATCH 013/254] Compute tensor size using integers (#851) --- CHANGELOG.md | 1 + src/tensors/allocator.h | 5 +++-- src/tensors/device.h | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbab6b5af..c8aed7e16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Dynamic gradient-scaling with `--dynamic-gradient-scaling`. - Add unit tests for binary files. - Fix compilation with OMP +- Compute aligned memory sizes using exact sizing ### Fixed - Fixed an issue when loading intgemm16 models from unaligned memory. 
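The allocator/device change below replaces the old `ceil(size / (double)alignment_) * alignment_` with pure integer arithmetic, so the rounded-up size stays exact even where double precision would lose low-order bits. A minimal self-contained sketch of the same round-up identity (the helper name `alignUp` is illustrative only, not part of the Marian API):

    #include <cassert>
    #include <cstddef>

    // Round `size` up to the next multiple of `alignment` using integers only.
    // Mirrors the arithmetic of the new Allocator::alignedSize()/Device::align().
    static size_t alignUp(size_t size, size_t alignment) {
      size_t over = size + alignment - 1;   // overshoot by at most alignment-1
      return over - (over % alignment);     // strip the remainder -> exact multiple
    }

    int main() {
      assert(alignUp(0, 256) == 0);
      assert(alignUp(1, 256) == 256);
      assert(alignUp(256, 256) == 256);
      assert(alignUp(257, 256) == 512);
      return 0;
    }
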
diff --git a/src/tensors/allocator.h b/src/tensors/allocator.h index 9dc44f58e..1844be142 100644 --- a/src/tensors/allocator.h +++ b/src/tensors/allocator.h @@ -175,8 +175,9 @@ class Allocator { reserve(bytes); } - size_t alignedSize(size_t size) { - return (size_t)(ceil(size / (double)alignment_) * alignment_); + size_t alignedSize(size_t size) const { + size_t over = size + alignment_ - 1; + return over - (over % alignment_); } void throwAtReallocation(bool throwRealloc) { throw_ = throwRealloc; } diff --git a/src/tensors/device.h b/src/tensors/device.h index 0be6c076c..5fe3c1fb2 100644 --- a/src/tensors/device.h +++ b/src/tensors/device.h @@ -15,8 +15,9 @@ class Device { size_t size_{0}; size_t alignment_; - size_t align(size_t size) { - return (size_t)(ceil(size / (float)alignment_) * alignment_); + size_t align(size_t size) const { + size_t over = size + alignment_ - 1; + return over - (over % alignment_); } public: From 3e51ff387232f1096e9560980f0115ac734224f5 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 20 Apr 2021 15:50:53 +0000 Subject: [PATCH 014/254] fix depth-scaling in FFN --- CHANGELOG.md | 1 + src/layers/generic.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbab6b5af..752847e1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fix compilation with OMP ### Fixed +- Missing depth-scaling in transformer FFN - Fixed an issue when loading intgemm16 models from unaligned memory. - Fix building marian with gcc 9.3+ and FBGEMM - Find MKL installed under Ubuntu 20.04 via apt-get diff --git a/src/layers/generic.h b/src/layers/generic.h index 2746bc854..8f390bd7d 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -192,7 +192,7 @@ static inline Expr denseInline(Expr x, float dropProb = 0.0f) { auto graph = x->graph(); - auto W = graph->param(prefix + "_W" + suffix, {x->shape()[-1], outDim}, inits::glorotUniform()); + auto W = graph->param(prefix + "_W" + suffix, {x->shape()[-1], outDim}, initFn); auto b = graph->param(prefix + "_b" + suffix, {1, outDim}, inits::zeros()); if(actName == "relu") { From 1c8ee95a544788deea2b5eaa217d9e864b606204 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 21 Apr 2021 05:14:36 +0000 Subject: [PATCH 015/254] update version --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index b609d445b..de0d73ce0 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.18 +v1.10.19 From 49e379bba5c77c1b80927b7f0db5603e171a1903 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 26 Apr 2021 11:51:43 +0000 Subject: [PATCH 016/254] Merged PR 18612: Early stopping on first, all, or any validation metrics Adds `--early-stopping-on first|all|any` allowing to decide if early stopping should take into account only first, all, or any validation metrics. 
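With the semantics implemented in scheduler.h below, `first` keeps the previous behaviour (only the first validator counts), `any` stops training as soon as any one validator has stalled for `--early-stopping` consecutive checks, and `all` stops only once every validator has stalled that long. A hypothetical invocation (other flags elided; the metric names are only examples):

    marian ... \
        --valid-metrics cross-entropy bleu-detok \
        --early-stopping 10 \
        --early-stopping-on any
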
Feature request: https://github.com/marian-nmt/marian-dev/issues/850 Regression tests: https://github.com/marian-nmt/marian-regression-tests/pull/79 --- CHANGELOG.md | 1 + regression-tests | 2 +- src/common/config_parser.cpp | 14 ++++--- src/common/config_validator.cpp | 9 ++++- src/training/scheduler.h | 69 ++++++++++++++++++++++----------- src/training/training_state.h | 4 -- src/training/validator.cpp | 14 +++---- src/training/validator.h | 2 +- 8 files changed, 74 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 752847e1f..7f41b8d12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Early stopping based on first, all, or any validation metrics via `--early-stopping-on` - Support for RMSNorm as drop-in replace for LayerNorm from `Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization`. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. - Extend suppression of unwanted output symbols, specifically "\n" from default vocabulary if generated by SentencePiece with byte-fallback. Deactivates with --allow-special - Allow for fine-grained CPU intrinsics overrides when BUILD_ARCH != native e.g. -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off diff --git a/regression-tests b/regression-tests index 7d612ca5e..1afd4eb10 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 7d612ca5e4b27a76f92584dad76d240e34f216d0 +Subproject commit 1afd4eb1014ac451c6a3d6f9b5d34c322902e624 diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 6495db0e7..f29b36307 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -244,7 +244,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Tie all embedding layers and output layer"); cli.add("--output-omit-bias", "Do not use a bias vector in decoder output layer"); - + // Transformer options cli.add("--transformer-heads", "Number of heads in multi-head attention (transformer)", @@ -529,13 +529,13 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", 100); - cli.add>("--dynamic-gradient-scaling", + cli.add>("--dynamic-gradient-scaling", "Re-scale gradient to have average gradient norm if (log) gradient norm diverges from average by arg1 sigmas. " "If arg2 = \"log\" the statistics are recorded for the log of the gradient norm else use plain norm") ->implicit_val("2.f log"); - cli.add("--check-gradient-nan", + cli.add("--check-gradient-nan", "Skip parameter update in case of NaNs in gradient"); - cli.add("--normalize-gradient", + cli.add("--normalize-gradient", "Normalize gradient by multiplying with no. 
devices / total labels (not recommended and to be removed in the future)"); cli.add>("--train-embedder-rank", @@ -574,6 +574,10 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { cli.add("--early-stopping", "Stop if the first validation metric does not improve for arg consecutive validation steps", 10); + cli.add("--early-stopping-on", + "Decide if early stopping should take into account first, all, or any validation metrics" + "Possible values: first, all, any", + "first"); // decoding options cli.add("--beam-size,-b", @@ -586,7 +590,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { "Maximum target length as source length times factor", 3); cli.add("--word-penalty", - "Subtract (arg * translation length) from translation score "); + "Subtract (arg * translation length) from translation score"); cli.add("--allow-unk", "Allow unknown words to appear in output"); cli.add("--n-best", diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index b24001450..fea7578f3 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -4,6 +4,8 @@ #include "common/utils.h" #include "common/filesystem.h" +#include + namespace marian { bool ConfigValidator::has(const std::string& key) const { @@ -129,6 +131,11 @@ void ConfigValidator::validateOptionsTraining() const { && !get>("valid-sets").empty(), errorMsg); + // check if --early-stopping-on has proper value + std::set supportedStops = {"first", "all", "any"}; + ABORT_IF(supportedStops.find(get("early-stopping-on")) == supportedStops.end(), + "Supported options for --early-stopping-on are: first, all, any"); + // validations for learning rate decaying ABORT_IF(get("lr-decay") > 1.f, "Learning rate decay factor greater than 1.0 is unusual"); @@ -145,7 +152,7 @@ void ConfigValidator::validateOptionsTraining() const { // validate ULR options ABORT_IF((has("ulr") && get("ulr") && (get("ulr-query-vectors") == "" || get("ulr-keys-vectors") == "")), - "ULR enablign requires query and keys vectors specified with --ulr-query-vectors and " + "ULR requires query and keys vectors specified with --ulr-query-vectors and " "--ulr-keys-vectors option"); // validate model quantization diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 9d2500f92..8d4fa30ca 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -28,7 +28,7 @@ class Scheduler : public TrainingObserver { // (regardless if it's the 1st or nth epoch and if it's a new or continued training), // which indicates the end of the training data stream from STDIN bool endOfStdin_{false}; // true at the end of the epoch if training from STDIN; - + // @TODO: figure out how to compute this with regard to updates as well, although maybe harder since no final value // determine scheduled LR decay factor (--lr-decay-inv-sqrt option) float getScheduledLRDecayFactor(const TrainingState& state) const { @@ -133,7 +133,7 @@ class Scheduler : public TrainingObserver { Scheduler(Ptr options, Ptr state, Ptr mpi = nullptr) : options_(options), state_(state), mpi_(mpi), gradientNormAvgWindow_(options_->get("gradient-norm-average-window", 100)) { - + // parse logical-epoch parameters auto logicalEpochStr = options->get>("logical-epoch", {"1e", "0"}); ABORT_IF(logicalEpochStr.empty(), "Logical epoch information is missing?"); @@ -174,7 +174,7 @@ class Scheduler : public TrainingObserver { size_t progress = state_->getProgressIn(mbWarmup.unit); // number of updates/labels processed auto progressRatio = 
(double)progress / (double)mbWarmup.n; // where are we relatively within target warm-up period // if unit is labels, then account for the fact that our increment itself is not constant -#if 1 // this seems to hurt convergence quite a bit compared to when updates is used +#if 1 // this seems to hurt convergence quite a bit compared to when updates is used if (mbWarmup.unit == SchedulingUnit::trgLabels) progressRatio = std::sqrt(progressRatio); #endif @@ -207,7 +207,7 @@ class Scheduler : public TrainingObserver { if(saveAndExitRequested()) // via SIGTERM return false; -#if 1 // @TODO: to be removed once we deprecate after-epochs and after-batches +#if 1 // @TODO: to be removed once we deprecate after-epochs and after-batches // stop if it reached the maximum number of epochs size_t stopAfterEpochs = options_->get("after-epochs"); if(stopAfterEpochs > 0 && calculateLogicalEpoch() > stopAfterEpochs) @@ -231,10 +231,9 @@ class Scheduler : public TrainingObserver { } } - // stop if the first validator did not improve for a given number of checks + // stop if the first/all/any validators did not improve for a given number of checks size_t stopAfterStalled = options_->get("early-stopping"); - if(stopAfterStalled > 0 && !validators_.empty() - && stalled() >= stopAfterStalled) + if(stopAfterStalled > 0 && stalled() >= stopAfterStalled) return false; // stop if data streaming from STDIN is stopped @@ -297,12 +296,11 @@ class Scheduler : public TrainingObserver { || (!state_->enteredNewPeriodOf(options_->get("valid-freq")) && !isFinal)) // not now return; - bool firstValidator = true; + size_t stalledPrev = stalled(); for(auto validator : validators_) { if(!validator) continue; - size_t stalledPrev = validator->stalled(); float value = 0; if(!mpi_ || mpi_->isMainProcess()) { // We run validation only in the main process, but this is risky with MPI. @@ -330,34 +328,60 @@ class Scheduler : public TrainingObserver { if(mpi_) { // collect and broadcast validation result to all processes and bring validator up-to-date mpi_->bCast(&value, 1, IMPIWrapper::getDataType(&value)); - + // @TODO: add function to validator? mpi_->bCast(&validator->stalled(), 1, IMPIWrapper::getDataType(&validator->stalled())); mpi_->bCast(&validator->lastBest(), 1, IMPIWrapper::getDataType(&validator->lastBest())); } - if(firstValidator) - state_->validBest = value; - state_->validators[validator->type()]["last-best"] = validator->lastBest(); state_->validators[validator->type()]["stalled"] = validator->stalled(); - - // notify training observers if the first validator did not improve - if(firstValidator && validator->stalled() > stalledPrev) - state_->newStalled(validator->stalled()); - firstValidator = false; } + // notify training observers about stalled validation + size_t stalledNew = stalled(); + if(stalledNew > stalledPrev) + state_->newStalled(stalledNew); + state_->validated = true; } + // Returns the proper number of stalled validation w.r.t. 
early-stopping-on size_t stalled() { + std::string stopOn = options_->get("early-stopping-on"); + if(stopOn == "any") + return stalledMax(); + if(stopOn == "all") + return stalledMin(); + return stalled1st(); + } + + // Returns the number of stalled validations for the first validator + size_t stalled1st() { if(!validators_.empty()) if(validators_[0]) return validators_[0]->stalled(); return 0; } + // Returns the largest number of stalled validations across validators or 0 if there are no validators + size_t stalledMax() { + size_t max = 0; + for(auto validator : validators_) + if(validator && validator->stalled() > max) + max = validator->stalled(); + return max; + } + + // Returns the lowest number of stalled validations across validators or 0 if there are no validators + size_t stalledMin() { + size_t min = std::numeric_limits::max(); + for(auto validator : validators_) + if(validator && validator->stalled() < min) + min = validator->stalled(); + return min == std::numeric_limits::max() ? 0 : min; + } + void update(StaticLoss rationalLoss, Ptr batch) { update(rationalLoss, /*numReadBatches=*/1, /*batchSize=*/batch->size(), /*batchLabels=*/batch->wordsTrg(), /*gradientNorm=*/0.f); } @@ -397,8 +421,8 @@ class Scheduler : public TrainingObserver { if(gradientNorm) { size_t range = std::min(gradientNormAvgWindow_, state_->batches); - float alpha = 2.f / (float)(range + 1); - + float alpha = 2.f / (float)(range + 1); + float delta = gradientNorm - state_->gradientNormAvg; state_->gradientNormAvg = state_->gradientNormAvg + alpha * delta; state_->gradientNormVar = (1.0f - alpha) * (state_->gradientNormVar + alpha * delta * delta); @@ -440,7 +464,7 @@ class Scheduler : public TrainingObserver { formatLogicalEpoch(), state_->batches, utils::withCommas(state_->samplesEpoch), - formatLoss(lossType, dispLabelCounts, batchLabels, state_), + formatLoss(lossType, dispLabelCounts, batchLabels, state_), timer_.elapsed(), state_->wordsDisp / timer_.elapsed(), state_->gradientNormAvg); @@ -627,7 +651,8 @@ class Scheduler : public TrainingObserver { if(options_->get("lr-decay-repeat-warmup")) { LOG(info, "Restarting learning rate warmup"); - state.warmupStart.n = state.getProgressIn(SchedulingParameter::parse(options_->get("lr-warmup")).unit); + state.warmupStart.n = state.getProgressIn( + SchedulingParameter::parse(options_->get("lr-warmup")).unit); } } } diff --git a/src/training/training_state.h b/src/training/training_state.h index 7d62f060d..e0c1ba5df 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -43,8 +43,6 @@ class TrainingState { size_t stalled{0}; // The largest number of stalled validations so far size_t maxStalled{0}; - // Last best validation score - float validBest{0.f}; std::string validator; // List of validators YAML::Node validators; @@ -217,7 +215,6 @@ class TrainingState { stalled = config["stalled"].as(); maxStalled = config["stalled-max"].as(); - validBest = config["valid-best"].as(); validator = config["validator"].as(); validators = config["validators"]; reset = config["reset"].as(); @@ -259,7 +256,6 @@ class TrainingState { config["stalled"] = stalled; config["stalled-max"] = maxStalled; - config["valid-best"] = validBest; config["validator"] = validator; config["validators"] = validators; config["reset"] = reset; diff --git a/src/training/validator.cpp b/src/training/validator.cpp index d824052f0..ef1bac3db 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -447,7 +447,7 @@ 
SacreBleuValidator::SacreBleuValidator(std::vector> vocabs, Ptr& stats, LOG_VALID_ONCE(info, "First sentence's tokens as scored:"); LOG_VALID_ONCE(info, " Hyp: {}", utils::join(decode(cand, /*addEOS=*/false))); LOG_VALID_ONCE(info, " Ref: {}", utils::join(decode(ref, /*addEOS=*/false))); - + if(useWordIds_) updateStats(stats, cand, ref); else updateStats(stats, decode(cand, /*addEOS=*/false), decode(ref, /*addEOS=*/false)); - + } // Re-implementation of BLEU metric from SacreBLEU @@ -627,7 +627,7 @@ float SacreBleuValidator::calcBLEU(const std::vector& stats) { for(int i = 0; i < order_; ++i) { float commonNgrams = stats[statsPerOrder * i + 0]; float hypothesesNgrams = stats[statsPerOrder * i + 1]; - + if(commonNgrams == 0.f) return 0.f; logbleu += std::log(commonNgrams) - std::log(hypothesesNgrams); @@ -653,7 +653,7 @@ float SacreBleuValidator::calcChrF(const std::vector& stats) { float commonNgrams = stats[statsPerOrder * i + 0]; float hypothesesNgrams = stats[statsPerOrder * i + 1]; float referencesNgrams = stats[statsPerOrder * i + 2]; - + if(hypothesesNgrams > 0 && referencesNgrams > 0) { avgPrecision += commonNgrams / hypothesesNgrams; avgRecall += commonNgrams / referencesNgrams; @@ -666,10 +666,10 @@ float SacreBleuValidator::calcChrF(const std::vector& stats) { avgPrecision /= effectiveOrder; avgRecall /= effectiveOrder; - + if(avgPrecision + avgRecall == 0.f) return 0.f; - + auto betaSquare = beta * beta; auto score = (1.f + betaSquare) * (avgPrecision * avgRecall) / ((betaSquare * avgPrecision) + avgRecall); return score * 100.f; // we multiply by 100 which is usually not done for ChrF, but this makes it more comparable to BLEU diff --git a/src/training/validator.h b/src/training/validator.h index d6e64d69a..16bfd2457 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -352,7 +352,7 @@ class SacreBleuValidator : public Validator { private: const std::string metric_; // allowed values are: bleu, bleu-detok (same as bleu), bleu-segmented, chrf bool computeChrF_{ false }; // should we compute ChrF instead of BLEU (BLEU by default)? - + size_t order_{ 4 }; // 4-grams for BLEU by default static const size_t statsPerOrder = 3; // 0: common ngrams, 1: candidate ngrams, 2: reference ngrams bool useWordIds_{ false }; // compute BLEU score by matching numeric segment ids From 36b4b69d7bbbe5e58cef4499011bef29feebf8b3 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 28 Apr 2021 13:28:50 +0100 Subject: [PATCH 017/254] Remove unused memoized_ variable (#852) --- CHANGELOG.md | 1 + src/graph/expression_graph.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dafb1ae2d..b03a07060 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Developer documentation framework based on Sphinx+Doxygen+Breathe+Exhale - Expresion graph documentation (#788) - Graph operators documentation (#801) +- Remove unused variable from expression graph ## [1.10.0] - 2021-02-06 diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index adc0aeae9..fce7d532f 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -145,8 +145,6 @@ class ExpressionGraph : public std::enable_shared_from_this { Ptr tensors_; private: - std::unordered_map> memoized_; - Type defaultElementType_{Type::float32}; // Type used for storing parameters, currently all parameters have to have the same type bool inferenceOnly_{false}; // a flag holds whether the graph is used for inference only From 909df372d10803395684a60d6d6fe0cb7de83637 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 28 Apr 2021 23:40:00 -0700 Subject: [PATCH 018/254] restart --- src/CMakeLists.txt | 1 - src/data/shortlist.cpp | 43 ++++++++++++++ src/data/shortlist.h | 29 ++++----- src/layers/generic.cpp | 1 - src/layers/lsh.cpp | 130 ----------------------------------------- src/layers/lsh.h | 31 ---------- src/layers/output.cpp | 14 ----- src/layers/output.h | 2 - 8 files changed, 59 insertions(+), 192 deletions(-) delete mode 100644 src/layers/lsh.cpp delete mode 100644 src/layers/lsh.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cf276137d..d2fd269f8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,7 +72,6 @@ set(MARIAN_SOURCES layers/generic.cpp layers/loss.cpp layers/weight.cpp - layers/lsh.cpp layers/embedding.cpp layers/output.cpp layers/logits.cpp diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 6f551262d..67317f4be 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -1,5 +1,6 @@ #include "data/shortlist.h" #include "microsoft/shortlist/utils/ParameterTree.h" +#include "marian.h" namespace marian { namespace data { @@ -12,6 +13,48 @@ const T* get(const void*& current, size_t num = 1) { return ptr; } +////////////////////////////////////////////////////////////////////////////////////// +Shortlist::Shortlist(const std::vector& indices) + : indices_(indices) {} + +const std::vector& Shortlist::indices() const { return indices_; } +WordIndex Shortlist::reverseMap(int idx) { return indices_[idx]; } + +WordIndex Shortlist::tryForwardMap(WordIndex wIdx) { + auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); + if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx + return (int)std::distance(indices_.begin(), first); // return coordinate if found + else + return npos; // return npos if not found, @TODO: replace with std::optional once we switch to C++17? +} + +void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { + int k = indices_.size(); + int currBeamSize = input->shape()[0]; + int batchSize = input->shape()[2]; + std::cerr << "currBeamSize=" << currBeamSize << std::endl; + std::cerr << "batchSize=" << batchSize << std::endl; + + Expr indicesExprBC; + broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExprBC, k); +} + + +void Shortlist::broadcast(Expr weights, + bool isLegacyUntransposedW, + Expr b, + Expr lemmaEt, + Expr indicesExprBC, + int k) { + cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indices()); + if (b) { + cachedShortb_ = index_select(b, -1, indices()); + } + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indices()); + return; + +} +////////////////////////////////////////////////////////////////////////////////////// QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, Ptr srcVocab, Ptr trgVocab, diff --git a/src/data/shortlist.h b/src/data/shortlist.h index f04676401..dd7d05894 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -19,26 +19,29 @@ namespace marian { namespace data { class Shortlist { -private: +protected: std::vector indices_; // // [packed shortlist index] -> word index, used to select columns from output embeddings + Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) + Expr cachedShortb_; // these match the current value of shortlist_ + Expr cachedShortLemmaEt_; + + virtual void broadcast(Expr weights, + bool isLegacyUntransposedW, + Expr b, + Expr lemmaEt, + Expr indicesExprBC, + int k); public: static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos - Shortlist(const std::vector& indices) - : indices_(indices) {} + Shortlist(const std::vector& indices); - const std::vector& indices() const { return indices_; } - WordIndex reverseMap(int idx) { return indices_[idx]; } - - WordIndex tryForwardMap(WordIndex wIdx) { - auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); - if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx - return (int)std::distance(indices_.begin(), first); // return coordinate if found - else - return npos; // return npos if not found, @TODO: replace with std::optional once we switch to C++17? 
- } + const std::vector& indices() const; + WordIndex reverseMap(int idx); + WordIndex tryForwardMap(WordIndex wIdx); + virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); }; class ShortlistGenerator { diff --git a/src/layers/generic.cpp b/src/layers/generic.cpp index 8e2ecfd79..17ef32fc6 100644 --- a/src/layers/generic.cpp +++ b/src/layers/generic.cpp @@ -4,7 +4,6 @@ #include "layers/constructors.h" #include "layers/generic.h" #include "layers/loss.h" -#include "layers/lsh.h" #include "models/states.h" // for EncoderState namespace marian {} // namespace marian diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp deleted file mode 100644 index a91778ed5..000000000 --- a/src/layers/lsh.cpp +++ /dev/null @@ -1,130 +0,0 @@ -#include "layers/lsh.h" -#include "graph/expression_operators.h" -#include "tensors/cpu/prod_blas.h" - -#if BLAS_FOUND -#include "3rd_party/faiss/IndexLSH.h" -#endif - -namespace marian { - -Expr LSH::apply(Expr input, Expr W, Expr b) { - auto idx = search(input, W); - return affine(idx, input, W, b); -} - -Expr LSH::search(Expr query, Expr values) { -#if BLAS_FOUND - ABORT_IF(query->graph()->getDeviceId().type == DeviceType::gpu, - "LSH index (--output-approx-knn) currently not implemented for GPU"); - - auto kShape = query->shape(); - kShape.set(-1, k_); - - auto forward = [this](Expr out, const std::vector& inputs) { - auto query = inputs[0]; - auto values = inputs[1]; - - int dim = values->shape()[-1]; - - if(!index_ || indexHash_ != values->hash()) { - LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_); - index_.reset(new faiss::IndexLSH(dim, nbits_, - /*rotate=*/dim != nbits_, - /*train_thesholds*/false)); - int vRows = values->shape().elements() / dim; - index_->train(vRows, values->val()->data()); - index_->add( vRows, values->val()->data()); - indexHash_ = values->hash(); - } - - int qRows = query->shape().elements() / dim; - std::vector distances(qRows * k_); - std::vector ids(qRows * k_); - - index_->search(qRows, query->val()->data(), k_, - distances.data(), ids.data()); - - std::vector vOut; - vOut.reserve(ids.size()); - for(auto id : ids) - vOut.push_back((IndexType)id); - - out->val()->set(vOut); - }; - - return lambda({query, values}, kShape, Type::uint32, forward); -#else - query; values; - ABORT("LSH output layer requires a CPU BLAS library"); -#endif -} - -Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) { - auto outShape = input->shape(); - int dimVoc = W->shape()[-2]; - outShape.set(-1, dimVoc); - - auto forward = [this](Expr out, const std::vector& inputs) { - auto lowest = NumericLimits(out->value_type()).lowest; - out->val()->set(lowest); - - int dimIn = inputs[1]->shape()[-1]; - int dimOut = out->shape()[-1]; - int dimRows = out->shape().elements() / dimOut; - - auto outPtr = out->val()->data(); - auto idxPtr = inputs[0]->val()->data(); - auto queryPtr = inputs[1]->val()->data(); - auto WPtr = inputs[2]->val()->data(); - auto bPtr = inputs.size() > 3 ? 
inputs[3]->val()->data() : nullptr; // nullptr if no bias given - - for(int row = 0; row < dimRows; ++row) { - auto currIdxPtr = idxPtr + row * k_; // move to next batch of k entries - auto currQueryPtr = queryPtr + row * dimIn; // move to next input query vector - auto currOutPtr = outPtr + row * dimOut; // move to next output position vector (of vocabulary size) - for(int k = 0; k < k_; k++) { - int relPos = currIdxPtr[k]; // k-th best vocabulay item - auto currWPtr = WPtr + relPos * dimIn; // offset for k-th best embedding - currOutPtr[relPos] = bPtr ? bPtr[relPos] : 0; // write bias value to position, init to 0 if no bias given - - // proceed one vector product at a time writing to the correct position - sgemm(false, true, 1, 1, dimIn, 1.0f, currQueryPtr, dimIn, currWPtr, dimIn, 1.0f, &currOutPtr[relPos], 1); - } - } - }; - - std::vector nodes = {idx, input, W}; - if(b) // bias is optional - nodes.push_back(b); - - return lambda(nodes, - outShape, - input->value_type(), - forward); -} - -// @TODO: alternative version which does the same as above with Marian operators, currently missing "scatter". -// this uses more memory and likely to be slower. Would make sense to have a scatter node that actually creates -// the node instead of relying on an existing node, e.g. scatter(shape, defaultValue, axis, indices, values); -#if 0 -Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) { - int dim = input->shape()[-1]; - int bch = idx->shape().elements() / k; - - auto W = reshape(rows(Wt_, flatten(idx)), {bch, k, dim}); // [rows, k, dim] - auto b = reshape(cols(b_, flatten(idx)), {bch, 1, k}); // [rows, 1, k] - - auto aff = reshape(bdot(reshape(input, {bch, 1, dim}), W, false, true) + b, idx->shape()); // [beam, time, batch, k] - - int dimVoc = Wt_->shape()[-2]; - auto oShape = input->shape(); - oShape.set(-1, dimVoc); - auto lowest = graph_->constant(oShape, - inits::fromValue(NumericLimits(input->value_type()).lowest), - input->value_type()); - return scatter(lowest, -1, idx, aff); -} -#endif - -} // namespace marian \ No newline at end of file diff --git a/src/layers/lsh.h b/src/layers/lsh.h deleted file mode 100644 index bf498cc60..000000000 --- a/src/layers/lsh.h +++ /dev/null @@ -1,31 +0,0 @@ -#include "graph/expression_graph.h" -#include - -namespace faiss { - struct IndexLSH; -} - -namespace marian { - -class LSH { -public: - LSH(int k, int nbits) : k_{k}, nbits_{nbits} { -#if !BLAS_FOUND - ABORT("LSH-based output approximation requires BLAS library"); -#endif - } - - Expr apply(Expr query, Expr values, Expr bias); - -private: - Ptr index_; - size_t indexHash_{0}; - - int k_{100}; - int nbits_{1024}; - - Expr search(Expr query, Expr values); - Expr affine(Expr idx, Expr query, Expr values, Expr bias); -}; - -} \ No newline at end of file diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 4c34bdcea..e9bffac4e 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -2,7 +2,6 @@ #include "common/timer.h" #include "data/factored_vocab.h" #include "layers/loss.h" -#include "layers/lsh.h" namespace marian { namespace mlp { @@ -12,13 +11,6 @@ namespace mlp { if(Wt_) return; - // this option is only set in the decoder - if(!lsh_ && options_->hasAndNotEmpty("output-approx-knn")) { - auto k = opt>("output-approx-knn")[0]; - auto nbits = opt>("output-approx-knn")[1]; - lsh_ = New(k, nbits); - } - auto name = options_->get("prefix"); auto numOutputClasses = options_->get("dim"); @@ -71,13 +63,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { }; auto 
affineOrLSH = [this, affineOrDot](Expr x, Expr W, Expr b, bool transA, bool transB) { - if(lsh_) { - ABORT_IF(transA, "Transposed query not supported for LSH"); - ABORT_IF(!transB, "Untransposed indexed matrix not supported for LSH"); - return lsh_->apply(x, W, b); // knows how to deal with undefined bias - } else { return affineOrDot(x, W, b, transA, transB); - } }; if(shortlist_ && !cachedShortWt_) { // shortlisted versions of parameters are cached within one diff --git a/src/layers/output.h b/src/layers/output.h index 2b6f49861..bf8a580a1 100644 --- a/src/layers/output.h +++ b/src/layers/output.h @@ -7,7 +7,6 @@ #include "marian.h" namespace marian { -class LSH; namespace mlp { @@ -28,7 +27,6 @@ class Output : public LayerBase, public IUnaryLogitLayer, public IHasShortList { // optional parameters set/updated after construction Expr tiedParam_; Ptr shortlist_; - Ptr lsh_; void lazyConstruct(int inputDim); From 592854f571e5c114c2e1f9d0469b07f0652381ce Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 28 Apr 2021 23:56:25 -0700 Subject: [PATCH 019/254] move cache variables into shortlist class --- src/data/shortlist.h | 3 +++ src/layers/output.cpp | 23 ++++++++++++----------- src/layers/output.h | 8 -------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/data/shortlist.h b/src/data/shortlist.h index dd7d05894..44da6faa0 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -42,6 +42,9 @@ class Shortlist { WordIndex tryForwardMap(WordIndex wIdx); virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); + virtual Expr getCachedShortWt() const { return cachedShortWt_; } + virtual Expr getCachedShortb() const { return cachedShortb_; } + virtual Expr getCachedShortLemmaEt() const { return cachedShortLemmaEt_; } }; class ShortlistGenerator { diff --git a/src/layers/output.cpp b/src/layers/output.cpp index e9bffac4e..0d46583a3 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -66,11 +66,9 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return affineOrDot(x, W, b, transA, transB); }; - if(shortlist_ && !cachedShortWt_) { // shortlisted versions of parameters are cached within one + if(shortlist_ && !shortlist_->getCachedShortWt()) { // shortlisted versions of parameters are cached within one // batch, then clear()ed - cachedShortWt_ = index_select(Wt_, isLegacyUntransposedW ? -1 : 0, shortlist_->indices()); - if(hasBias_) - cachedShortb_ = index_select(b_, -1, shortlist_->indices()); + shortlist_->filter(input, Wt_, isLegacyUntransposedW, b_, lemmaEt_); } if(factoredVocab_) { @@ -93,8 +91,8 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // slice this group's section out of W_ Expr factorWt, factorB; if(g == 0 && shortlist_) { - factorWt = cachedShortWt_; - factorB = cachedShortb_; + factorWt = shortlist_->getCachedShortWt(); + factorB = shortlist_->getCachedShortb(); } else { factorWt = slice( Wt_, isLegacyUntransposedW ? 
-1 : 0, Slice((int)range.first, (int)range.second)); @@ -240,10 +238,13 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } #endif // re-embedding lookup, soft-indexed by softmax - if(shortlist_ && !cachedShortLemmaEt_) // short-listed version of re-embedding matrix - cachedShortLemmaEt_ = index_select(lemmaEt_, -1, shortlist_->indices()); + Expr cachedShortLemmaEt; + if(shortlist_) // short-listed version of re-embedding matrix + cachedShortLemmaEt = shortlist_->getCachedShortLemmaEt(); + else + cachedShortLemmaEt = lemmaEt_; auto e = dot(factorSoftmax, - cachedShortLemmaEt_ ? cachedShortLemmaEt_ : lemmaEt_, + cachedShortLemmaEt, false, true); // [B... x L] // project it back to regular hidden dim @@ -265,8 +266,8 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return Logits(std::move(allLogits), factoredVocab_); } else if(shortlist_) { return Logits(affineOrLSH(input, - cachedShortWt_, - cachedShortb_, + shortlist_->getCachedShortWt(), + shortlist_->getCachedShortb(), false, /*transB=*/isLegacyUntransposedW ? false : true)); } else { diff --git a/src/layers/output.h b/src/layers/output.h index bf8a580a1..d3afdeadf 100644 --- a/src/layers/output.h +++ b/src/layers/output.h @@ -19,9 +19,6 @@ class Output : public LayerBase, public IUnaryLogitLayer, public IHasShortList { bool isLegacyUntransposedW{false}; // legacy-model emulation: W is stored in non-transposed form bool hasBias_{true}; - Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) - Expr cachedShortb_; // these match the current value of shortlist_ - Expr cachedShortLemmaEt_; Ptr factoredVocab_; // optional parameters set/updated after construction @@ -49,8 +46,6 @@ class Output : public LayerBase, public IUnaryLogitLayer, public IHasShortList { ABORT_IF(shortlist.get() != shortlist_.get(), "Output shortlist cannot be changed except after clear()"); else { - ABORT_IF(cachedShortWt_ || cachedShortb_ || cachedShortLemmaEt_, - "No shortlist but cached parameters??"); shortlist_ = shortlist; } // cachedShortWt_ and cachedShortb_ will be created lazily inside apply() @@ -60,9 +55,6 @@ class Output : public LayerBase, public IUnaryLogitLayer, public IHasShortList { // cachedShortWt_ etc. 
in the graph's short-term cache void clear() override final { shortlist_ = nullptr; - cachedShortWt_ = nullptr; - cachedShortb_ = nullptr; - cachedShortLemmaEt_ = nullptr; } Logits applyAsLogits(Expr input) override final; From 67fe82f8401e83efffb6286893c9d2ea8d967115 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 00:08:21 -0700 Subject: [PATCH 020/254] start broadcast --- src/data/shortlist.cpp | 44 ++++++++++++++++++++++++++++++++++++++++-- src/data/shortlist.h | 3 ++- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 67317f4be..886e74fea 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -29,16 +29,56 @@ WordIndex Shortlist::tryForwardMap(WordIndex wIdx) { } void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - int k = indices_.size(); + //if (indicesExpr_) return; int currBeamSize = input->shape()[0]; int batchSize = input->shape()[2]; std::cerr << "currBeamSize=" << currBeamSize << std::endl; std::cerr << "batchSize=" << batchSize << std::endl; - Expr indicesExprBC; + auto forward = [this](Expr out, const std::vector& inputs) { + out->val()->set(indices_); + }; + + int k = indices_.size(); + Shape kShape({k}); + indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); + + Expr indicesExprBC = getIndicesExpr(batchSize, currBeamSize); broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExprBC, k); } +Expr Shortlist::getIndicesExpr(int batchSize, int beamSize) const { + int k = indicesExpr_->shape()[0]; + Expr ones = indicesExpr_->graph()->constant({batchSize, beamSize, 1}, inits::ones(), Type::float32); + + Expr tmp = reshape(indicesExpr_, {1, k}); + tmp = cast(tmp, Type::float32); + + Expr out = ones * tmp; + //debug(out, "out.1"); + + auto forward = [](Expr out, const std::vector& inputs) { + Expr in = inputs[0]; + const Shape &shape = in->shape(); + const float *inPtr = in->val()->data(); + uint32_t *outPtr = out->val()->data(); + + for (int i = 0; i < shape.elements(); ++i) { + const float &val = inPtr[i]; + uint32_t valConv = (uint32_t)val; + uint32_t &valOut = outPtr[i]; + valOut = valConv; + //std::cerr << val << " " << valConv << " " << valOut << std::endl; + } + }; + out = lambda({out}, out->shape(), Type::uint32, forward); + //debug(out, "out.2"); + //out = cast(out, Type::uint32); + //std::cerr << "getIndicesExpr.2=" << out->shape() << std::endl; + //out = reshape(out, {k}); + + return out; +} void Shortlist::broadcast(Expr weights, bool isLegacyUntransposedW, diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 44da6faa0..67a8b74c0 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -21,7 +21,7 @@ namespace data { class Shortlist { protected: std::vector indices_; // // [packed shortlist index] -> word index, used to select columns from output embeddings - + Expr indicesExpr_; Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) Expr cachedShortb_; // these match the current value of shortlist_ Expr cachedShortLemmaEt_; @@ -42,6 +42,7 @@ class Shortlist { WordIndex tryForwardMap(WordIndex wIdx); virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); + virtual Expr getIndicesExpr(int batchSize, int currBeamSize) const; virtual Expr getCachedShortWt() const { return cachedShortWt_; } virtual Expr getCachedShortb() const { return cachedShortb_; } virtual Expr getCachedShortLemmaEt() const { return cachedShortLemmaEt_; } 
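The filter()/getIndicesExpr() pair added in the patch above moves the shortlist indices into the expression graph: a lambda node first materializes the flat [k] index vector, and getIndicesExpr() then replicates it for every (batch, beam) position by multiplying a float32 ones tensor with the casted indices and converting each element back to uint32. A minimal standalone C++ sketch of that broadcast, written outside the Marian graph API and with illustrative names only (not part of the patch):

#include <cstdint>
#include <vector>

// Replicate a flat shortlist of k word indices for every (batch, beam) pair,
// mimicking the float32 detour taken by the lambda node in getIndicesExpr():
// the graph computes ones * cast(indices, float32) and then casts back.
std::vector<uint32_t> broadcastIndices(const std::vector<uint32_t>& indices,
                                       int batchSize, int beamSize) {
  std::vector<uint32_t> out;
  out.reserve(static_cast<size_t>(batchSize) * beamSize * indices.size());
  for(int b = 0; b < batchSize; ++b)
    for(int h = 0; h < beamSize; ++h)
      for(uint32_t idx : indices) {
        float asFloat = static_cast<float>(idx);        // broadcast happens in float32
        out.push_back(static_cast<uint32_t>(asFloat));  // element-wise cast back to uint32
      }
  return out;
}

The float round trip is lossless as long as each index fits in 24 bits, which holds for typical vocabulary and shortlist sizes.
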
From f41acb1aa86da3b7f357e63218550026835564da Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 00:31:01 -0700 Subject: [PATCH 021/254] start factor mask --- src/data/shortlist.cpp | 29 ++++++++++++++++++++++++++++- src/layers/logits.cpp | 15 ++++++++++----- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 886e74fea..f82eb9ccb 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -86,13 +86,40 @@ void Shortlist::broadcast(Expr weights, Expr lemmaEt, Expr indicesExprBC, int k) { + ///* cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indices()); if (b) { cachedShortb_ = index_select(b, -1, indices()); } cachedShortLemmaEt_ = index_select(lemmaEt, -1, indices()); return; - + //*/ + int batchSize = indicesExprBC->shape()[0]; + int currBeamSize = indicesExprBC->shape()[1]; + //int numHypos = batchSize * currBeamSize; + //std::cerr << "batchSize=" << batchSize << std::endl; + //std::cerr << "currBeamSize=" << currBeamSize << std::endl; + indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); + //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; + + cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); + //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; + cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); + //std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; + cachedShortWt_ = transpose(cachedShortWt_, {1, 2, 0, 3}); + //std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; + + if (b) { + assert(false); + cachedShortb_ = index_select(b, -1, indicesExprBC); + cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested + } + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); + //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {batchSize, currBeamSize, k, cachedShortLemmaEt_->shape()[0]}); + //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 3, 0, 2}); + //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } ////////////////////////////////////////////////////////////////////////////////////// QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 8c4d69bde..4f0ad8151 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -96,11 +96,16 @@ Expr Logits::getFactoredLogits(size_t groupIndex, for(size_t g = 1; g < numGroups; g++) { auto factorMaxima = max(logits_[g]->loss(), -1); // we cast since loss is likely ce-loss which has type float32 - auto factorMasks = constant( - getFactorMasks(g, shortlist ? 
shortlist->indices() : std::vector())); - sel = sel - + cast(factorMaxima, sel->value_type()) - * cast(factorMasks, sel->value_type()); // those lemmas that don't have a factor + Expr factorMasks; + if (!shortlist) { + factorMasks = constant(getFactorMasks(g, std::vector())); + } + else { + factorMasks = constant(getFactorMasks(g, shortlist->indices())); + } + factorMaxima = cast(factorMaxima, sel->value_type()); + factorMasks = cast(factorMasks, sel->value_type()); + sel = sel + factorMaxima * factorMasks; // those lemmas that don't have a factor // get multiplied with 0 } } From 6b2b7d11880013c9574e6bcd8d67bef4f28be97c Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 00:44:30 -0700 Subject: [PATCH 022/254] factor mask --- src/layers/logits.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++- src/layers/logits.h | 1 + 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 4f0ad8151..c327bd0d7 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -101,7 +101,28 @@ Expr Logits::getFactoredLogits(size_t groupIndex, factorMasks = constant(getFactorMasks(g, std::vector())); } else { - factorMasks = constant(getFactorMasks(g, shortlist->indices())); + //std::cerr << "sel=" << sel->shape() << std::endl; + int currBeamSize = sel->shape()[0]; + int batchSize = sel->shape()[2]; + + auto forward = [this, g, currBeamSize, batchSize](Expr out, const std::vector& inputs) { + std::vector indices; + Expr lastIndices = inputs[0]; + lastIndices->val()->get(indices); + std::vector masks = getFactorMasks2(batchSize, currBeamSize, g, indices); + out->val()->set(masks); + }; + + Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); + //std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; + factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); + //std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; + factorMasks = transpose(factorMasks, {1, 0, 2}); + //std::cerr << "factorMasks.2=" << factorMasks->shape() << std::endl; + + const Shape &s = factorMasks->shape(); + factorMasks = reshape(factorMasks, {s[0], 1, s[1], s[2]}); + //std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; } factorMaxima = cast(factorMaxima, sel->value_type()); factorMasks = cast(factorMasks, sel->value_type()); @@ -219,6 +240,27 @@ std::vector Logits::getFactorMasks(size_t factorGroup, const std::vector< return res; } +std::vector Logits::getFactorMasks2(int batchSize, int currBeamSize, size_t factorGroup, const std::vector& indices) + const { // [lemmaIndex] -> 1.0 for words that do have this factor; else 0 + size_t n + = indices.empty() + ? (factoredVocab_->getGroupRange(0).second - factoredVocab_->getGroupRange(0).first) + : indices.size() / currBeamSize; + std::vector res; + res.reserve(currBeamSize * n); + + // @TODO: we should rearrange lemmaHasFactorGroup as vector[groups[i] of float; then move this + // into FactoredVocab + for (size_t currBeam = 0; currBeam < currBeamSize; ++currBeam) { + for(size_t i = 0; i < n; i++) { + size_t idx = currBeam * n + i; + size_t lemma = indices.empty() ? 
i : (indices[idx] - factoredVocab_->getGroupRange(0).first); + res.push_back((float)factoredVocab_->lemmaHasFactorGroup(lemma, factorGroup)); + } + } + return res; +} + Logits Logits::applyUnaryFunction( const std::function& f) const { // clone this but apply f to all loss values std::vector> newLogits; diff --git a/src/layers/logits.h b/src/layers/logits.h index c61a9e742..1c93926d3 100644 --- a/src/layers/logits.h +++ b/src/layers/logits.h @@ -80,6 +80,7 @@ class Logits { } // actually the same as constant(data) for this data type std::vector getFactorMasks(size_t factorGroup, const std::vector& indices) const; + std::vector getFactorMasks2(int batchSize, int currBeamSize, size_t factorGroup, const std::vector& indices) const; private: // members From 5225331767c4f1f5dc4104750f23a0580487d553 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 02:22:26 -0700 Subject: [PATCH 023/254] cachedShortLemmaEt_ works --- src/data/shortlist.cpp | 33 ++++++++++++++++++++------------- src/layers/output.cpp | 21 ++++++++++++++------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index f82eb9ccb..d384cbed4 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -86,21 +86,34 @@ void Shortlist::broadcast(Expr weights, Expr lemmaEt, Expr indicesExprBC, int k) { + int batchSize = indicesExprBC->shape()[0]; + int currBeamSize = indicesExprBC->shape()[1]; + //int numHypos = batchSize * currBeamSize; + //std::cerr << "batchSize=" << batchSize << std::endl; + //std::cerr << "currBeamSize=" << currBeamSize << std::endl; ///* cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indices()); if (b) { cachedShortb_ = index_select(b, -1, indices()); } + std::cerr << "lemmaEt=" << lemmaEt->shape() << std::endl; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indices()); + std::cerr << "cachedShortLemmaEt_=" << cachedShortLemmaEt_->shape() << std::endl; + + + indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); + std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; + + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); + std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); + std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 0, 1, 3}); + std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; + return; //*/ - int batchSize = indicesExprBC->shape()[0]; - int currBeamSize = indicesExprBC->shape()[1]; - //int numHypos = batchSize * currBeamSize; - //std::cerr << "batchSize=" << batchSize << std::endl; - //std::cerr << "currBeamSize=" << currBeamSize << std::endl; - indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); - //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; + cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExprBC); //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; @@ -114,12 +127,6 @@ void Shortlist::broadcast(Expr weights, cachedShortb_ = index_select(b, -1, indicesExprBC); cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested } - cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); - //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {batchSize, currBeamSize, k, cachedShortLemmaEt_->shape()[0]}); - //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; - cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 3, 0, 2}); - //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } ////////////////////////////////////////////////////////////////////////////////////// QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 0d46583a3..4a7ae2678 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -66,7 +66,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return affineOrDot(x, W, b, transA, transB); }; - if(shortlist_ && !shortlist_->getCachedShortWt()) { // shortlisted versions of parameters are cached within one + if(shortlist_) { // shortlisted versions of parameters are cached within one // batch, then clear()ed shortlist_->filter(input, Wt_, isLegacyUntransposedW, b_, lemmaEt_); } @@ -241,12 +241,19 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { Expr cachedShortLemmaEt; if(shortlist_) // short-listed version of re-embedding matrix cachedShortLemmaEt = shortlist_->getCachedShortLemmaEt(); - else - cachedShortLemmaEt = lemmaEt_; - auto e = dot(factorSoftmax, - cachedShortLemmaEt, - false, - true); // [B... x L] + else { + const Shape &s = lemmaEt_->shape(); + cachedShortLemmaEt = reshape(lemmaEt_, {1, s[0], 1, s[1]}); + } + //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; + Expr e = factorSoftmax * cachedShortLemmaEt; + //std::cerr << "e.1=" << e->shape() << std::endl; + e = sum(e, 3); + //std::cerr << "e.2=" << e->shape() << std::endl; + e = transpose(e, {0, 3, 2, 1}); + //std::cerr << "e.3=" << e->shape() << std::endl; + // project it back to regular hidden dim int inputDim = input1->shape()[-1]; auto name = options_->get("prefix"); From d41353eeb79010717dad0c0e3eb03cff03498672 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 11:24:00 -0700 Subject: [PATCH 024/254] get ready for cachedShortWt_ --- src/data/shortlist.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index d384cbed4..e73372ac9 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -91,25 +91,26 @@ void Shortlist::broadcast(Expr weights, //int numHypos = batchSize * currBeamSize; //std::cerr << "batchSize=" << batchSize << std::endl; //std::cerr << "currBeamSize=" << currBeamSize << std::endl; + std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; + ABORT_IF(!isLegacyUntransposedW, "Legacy untranspose W not yet tested"); ///* + std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indices()); + std::cerr << "cachedShortWt_=" << cachedShortWt_->shape() << std::endl; if (b) { + ABORT("Bias not yet tested"); cachedShortb_ = index_select(b, -1, indices()); } - std::cerr << "lemmaEt=" << lemmaEt->shape() << std::endl; - cachedShortLemmaEt_ = index_select(lemmaEt, -1, indices()); - std::cerr << "cachedShortLemmaEt_=" << cachedShortLemmaEt_->shape() << std::endl; - indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); - std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; + //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); - std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); - std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 0, 1, 3}); - std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; return; //*/ From e518fc96669b17844bf12fd23388b80116f978ed Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 11:59:25 -0700 Subject: [PATCH 025/254] cachedShortWt_ works --- src/data/shortlist.cpp | 36 ++++++++++++------------------------ src/layers/output.cpp | 27 ++++++++++++++++++++------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index e73372ac9..d76a5f8de 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -32,8 +32,8 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp //if (indicesExpr_) return; int currBeamSize = input->shape()[0]; int batchSize = input->shape()[2]; - std::cerr << "currBeamSize=" << currBeamSize << std::endl; - std::cerr << "batchSize=" << batchSize << std::endl; + //std::cerr << "currBeamSize=" << currBeamSize << std::endl; + //std::cerr << "batchSize=" << batchSize << std::endl; auto forward = [this](Expr out, const std::vector& inputs) { out->val()->set(indices_); @@ -91,31 +91,12 @@ void Shortlist::broadcast(Expr weights, //int numHypos = batchSize * currBeamSize; //std::cerr << "batchSize=" << batchSize << std::endl; //std::cerr << "currBeamSize=" << currBeamSize << std::endl; - std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; - ABORT_IF(!isLegacyUntransposedW, "Legacy untranspose W not yet tested"); - ///* - std::cerr << "weights=" << weights->shape() << std::endl; - cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indices()); - std::cerr << "cachedShortWt_=" << cachedShortWt_->shape() << std::endl; - if (b) { - ABORT("Bias not yet tested"); - cachedShortb_ = index_select(b, -1, indices()); - } + //std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; + ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; - cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); - //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); - //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; - cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 0, 1, 3}); - //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; - - return; - //*/ - - cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); @@ -124,10 +105,17 @@ void Shortlist::broadcast(Expr weights, //std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; if (b) { - assert(false); + ABORT("Bias not yet tested"); cachedShortb_ = index_select(b, -1, indicesExprBC); cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested } + + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); + //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); + //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 0, 1, 3}); + //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } ////////////////////////////////////////////////////////////////////////////////////// QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 4a7ae2678..9046a09e0 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -62,8 +62,19 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return dot(x, W, transA, transB); }; - auto affineOrLSH = [this, affineOrDot](Expr x, Expr W, Expr b, bool transA, bool transB) { - return affineOrDot(x, W, b, transA, transB); + auto affineShortlist = [this, affineOrDot](Expr x, Expr W, Expr b, bool transA, bool transB) { + //std::cerr << "x=" << x->shape() << std::endl; + //std::cerr << "W=" << W->shape() << std::endl; + //std::cerr << "transA=" << transA << " transB=" << transB << std::endl; + + Expr ret = x * W; + ret = sum(ret, 3); + //const Shape &retShape = ret->shape(); + //std::cerr << "ret.1=" << retShape << std::endl; + ret = transpose(ret, {0, 3, 2, 1}); + //ret = reshape(ret, {retShape[0], 1, 1, retShape[2]}); + //std::cerr << "ret.2=" << ret->shape() << std::endl; + return ret; }; if(shortlist_) { // shortlisted versions of parameters are cached within one @@ -164,20 +175,22 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // @TODO: b_ should be a vector, not a matrix; but shotlists use 
cols() in, which requires a // matrix Expr factorLogits; - if(g == 0) - factorLogits = affineOrLSH( + if(g == 0 && shortlist_) { + factorLogits = affineShortlist( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits - else + } + else { factorLogits = affineOrDot( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits + } // optionally add lemma-dependent bias if(Plemma) { // [B... x U0] @@ -272,14 +285,14 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } return Logits(std::move(allLogits), factoredVocab_); } else if(shortlist_) { - return Logits(affineOrLSH(input, + return Logits(affineOrDot(input, shortlist_->getCachedShortWt(), shortlist_->getCachedShortb(), false, /*transB=*/isLegacyUntransposedW ? false : true)); } else { return Logits( - affineOrLSH(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true)); + affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true)); } } From 1784da05856a13234c457de57ba31690a43ca979 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 12:22:45 -0700 Subject: [PATCH 026/254] start lsh --- src/data/shortlist.cpp | 132 ++++++++++++++++++++++++++++++--- src/data/shortlist.h | 38 +++++++++- src/translator/beam_search.cpp | 6 +- src/translator/translator.h | 5 +- 4 files changed, 165 insertions(+), 16 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index d76a5f8de..0a6841d61 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -2,6 +2,10 @@ #include "microsoft/shortlist/utils/ParameterTree.h" #include "marian.h" +#if BLAS_FOUND +#include "3rd_party/faiss/IndexLSH.h" +#endif + namespace marian { namespace data { @@ -18,9 +22,9 @@ Shortlist::Shortlist(const std::vector& indices) : indices_(indices) {} const std::vector& Shortlist::indices() const { return indices_; } -WordIndex Shortlist::reverseMap(int idx) { return indices_[idx]; } +WordIndex Shortlist::reverseMap(size_t beamIdx, int idx) const { return indices_[idx]; } -WordIndex Shortlist::tryForwardMap(WordIndex wIdx) { +WordIndex Shortlist::tryForwardMap(size_t beamIdx, WordIndex wIdx) const { auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx return (int)std::distance(indices_.begin(), first); // return coordinate if found @@ -117,6 +121,110 @@ void Shortlist::broadcast(Expr weights, cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 0, 1, 3}); //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } + +/////////////////////////////////////////////////////////////////////////////////// +Ptr LSHShortlist::index_; + +LSHShortlist::LSHShortlist(int k, int nbits) +: Shortlist(std::vector()) +, k_(k), nbits_(nbits) { + //std::cerr << "LSHShortlist" << std::endl; + /* + for (int i = 0; i < k_; ++i) { + indices_.push_back(i); + } + */ +} + +#define BLAS_FOUND 1 + +WordIndex LSHShortlist::reverseMap(size_t beamIdx, int idx) const { + idx = k_ * beamIdx + idx; + assert(idx < indices_.size()); + return indices_[idx]; +} + +WordIndex LSHShortlist::tryForwardMap(size_t beamIdx, WordIndex wIdx) const { + //utils::Debug(indices_, "LSHShortlist::tryForwardMap indices_"); + auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); + bool found = first != indices_.end(); + if(found && *first == wIdx) // 
check if element not less than wIdx has been found and if equal to wIdx + return (int)std::distance(indices_.begin(), first); // return coordinate if found + else + return npos; // return npos if not found, @TODO: replace with std::optional once we switch to C++17? +} + +Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { + assert(indicesExpr_->shape()[0] == currBeamSize); + return indicesExpr_; +} + +void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { +#if BLAS_FOUND + int currBeamSize = input->shape()[0]; + ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, + "LSH index (--output-approx-knn) currently not implemented for GPU"); + + auto forward = [this, currBeamSize](Expr out, const std::vector& inputs) { + auto query = inputs[0]; + auto values = inputs[1]; + int dim = values->shape()[-1]; + + if(!index_) { + //std::cerr << "build lsh index" << std::endl; + LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_); + index_.reset(new faiss::IndexLSH(dim, nbits_, + /*rotate=*/dim != nbits_, + /*train_thesholds*/false)); + int vRows = 32121; //47960; //values->shape().elements() / dim; + index_->train(vRows, values->val()->data()); + index_->add( vRows, values->val()->data()); + } + + int qRows = query->shape().elements() / dim; + std::vector distances(qRows * k_); + std::vector ids(qRows * k_); + + index_->search(qRows, query->val()->data(), k_, + distances.data(), ids.data()); + + indices_.clear(); + for(auto id : ids) { + indices_.push_back(id); + } + + for (size_t beamIdx = 0; beamIdx < currBeamSize; ++beamIdx) { + size_t startIdx = k_ * beamIdx; + size_t endIdx = startIdx + k_; + std::sort(indices_.begin() + startIdx, indices_.begin() + endIdx); + } + out->val()->set(indices_); + //std::cerr << "out=" << out->shape() << " " << out->val() << std::endl; + }; + + Shape kShape({currBeamSize, k_}); + //std::cerr << "kShape=" << kShape << std::endl; + + indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); + //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; + + broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExpr_, k_); + +#else + query; values; + ABORT("LSH output layer requires a CPU BLAS library"); +#endif +} + +LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits) + : k_(k), nbits_(nbits) { + //std::cerr << "LSHShortlistGenerator" << std::endl; +} + +Ptr LSHShortlistGenerator::generate(Ptr batch) const { + return New(k_, nbits_); +} + ////////////////////////////////////////////////////////////////////////////////////// QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, Ptr srcVocab, @@ -242,16 +350,22 @@ Ptr QuicksandShortlistGenerator::generate(Ptr batc Ptr createShortlistGenerator(Ptr options, Ptr srcVocab, Ptr trgVocab, + const std::vector &lshOpts, size_t srcIdx, size_t trgIdx, bool shared) { - std::vector vals = options->get>("shortlist"); - ABORT_IF(vals.empty(), "No path to shortlist given"); - std::string fname = vals[0]; - if(filesystem::Path(fname).extension().string() == ".bin") { - return New(options, srcVocab, trgVocab, srcIdx, trgIdx, shared); - } else { - return New(options, srcVocab, trgVocab, srcIdx, trgIdx, shared); + if (lshOpts.size() == 2) { + return New(lshOpts[0], lshOpts[1]); + } + else { + std::vector vals = options->get>("shortlist"); + ABORT_IF(vals.empty(), "No path to shortlist given"); + std::string fname = vals[0]; + 
if(filesystem::Path(fname).extension().string() == ".bin") { + return New(options, srcVocab, trgVocab, srcIdx, trgIdx, shared); + } else { + return New(options, srcVocab, trgVocab, srcIdx, trgIdx, shared); + } } } diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 67a8b74c0..40702dfc9 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -15,6 +15,10 @@ #include #include +namespace faiss { + struct IndexLSH; +} + namespace marian { namespace data { @@ -38,8 +42,8 @@ class Shortlist { Shortlist(const std::vector& indices); const std::vector& indices() const; - WordIndex reverseMap(int idx); - WordIndex tryForwardMap(WordIndex wIdx); + virtual WordIndex reverseMap(size_t beamIdx, int idx) const; + virtual WordIndex tryForwardMap(size_t beamIdx, WordIndex wIdx) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); virtual Expr getIndicesExpr(int batchSize, int currBeamSize) const; @@ -61,6 +65,35 @@ class ShortlistGenerator { } }; +/////////////////////////////////////////////////////////////////////////////////// +class LSHShortlist: public Shortlist { +private: + int k_; + int nbits_; + + static Ptr index_; + +public: + LSHShortlist(int k, int nbits); + virtual WordIndex reverseMap(size_t beamIdx, int idx) const override; + virtual WordIndex tryForwardMap(size_t beamIdx, WordIndex wIdx) const override; + + virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; + virtual Expr getIndicesExpr(int batchSize,int currBeamSize) const override; + +}; + +class LSHShortlistGenerator : public ShortlistGenerator { +private: + int k_; + int nbits_; + +public: + LSHShortlistGenerator(int k, int nbits); + Ptr generate(Ptr batch) const override; +}; + +/////////////////////////////////////////////////////////////////////////////////// // Intended for use during training in the future, currently disabled #if 0 @@ -345,6 +378,7 @@ unless the extension is *.bin for which the Microsoft legacy binary shortlist is Ptr createShortlistGenerator(Ptr options, Ptr srcVocab, Ptr trgVocab, + const std::vector &lshOpts, size_t srcIdx = 0, size_t trgIdx = 1, bool shared = false); diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 91dde6e6f..d7ecf5eee 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -94,7 +94,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? 
shortlist->reverseMap(prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), @@ -115,7 +115,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } } else if (shortlist) - word = Word::fromWordIndex(shortlist->reverseMap(wordIdx)); + word = Word::fromWordIndex(shortlist->reverseMap(prevBeamHypIdx, wordIdx)); else word = Word::fromWordIndex(wordIdx); @@ -308,7 +308,7 @@ Histories BeamSearch::search(Ptr graph, Ptr suppressed.erase(std::remove_if(suppressed.begin(), suppressed.end(), [&](WordIndex i) { - return shortlist->tryForwardMap(i) == data::Shortlist::npos; + return shortlist->tryForwardMap(3343, i) == data::Shortlist::npos; // TODO beamIdx }), suppressed.end()); diff --git a/src/translator/translator.h b/src/translator/translator.h index fe01065b6..511a42507 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -62,8 +62,9 @@ class Translate : public ModelTask { trgVocab_->load(vocabs.back()); auto srcVocab = corpus_->getVocabs()[0]; - if(options_->hasAndNotEmpty("shortlist")) - shortlistGenerator_ = data::createShortlistGenerator(options_, srcVocab, trgVocab_, 0, 1, vocabs.front() == vocabs.back()); + std::vector lshOpts = options_->get>("output-approx-knn"); + if (lshOpts.size() == 2 || options_->hasAndNotEmpty("shortlist")) + shortlistGenerator_ = data::createShortlistGenerator(options_, srcVocab, trgVocab_, lshOpts, 0, 1, vocabs.front() == vocabs.back()); auto devices = Config::getDevices(options_); numDevices_ = devices.size(); From 947301a8172c2706fb7acac293a2b63b9d5ef6c4 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 12:54:35 -0700 Subject: [PATCH 027/254] lsh runs but crap output --- src/data/shortlist.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 0a6841d61..afc95a776 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -90,6 +90,7 @@ void Shortlist::broadcast(Expr weights, Expr lemmaEt, Expr indicesExprBC, int k) { + std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; int batchSize = indicesExprBC->shape()[0]; int currBeamSize = indicesExprBC->shape()[1]; //int numHypos = batchSize * currBeamSize; @@ -155,17 +156,23 @@ WordIndex LSHShortlist::tryForwardMap(size_t beamIdx, WordIndex wIdx) const { } Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { - assert(indicesExpr_->shape()[0] == currBeamSize); + std::cerr << "batchSize=" << batchSize << " currBeamSize=" << currBeamSize << std::endl; + std::cerr << "indicesExpr_=" << indicesExpr_->shape() << " " << indicesExpr_->val() << std::endl; + assert(indicesExpr_->shape()[0] == batchSize); + assert(indicesExpr_->shape()[1] == currBeamSize); return indicesExpr_; } void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { #if BLAS_FOUND - int currBeamSize = input->shape()[0]; ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, "LSH index (--output-approx-knn) currently not implemented for GPU"); - auto forward = [this, currBeamSize](Expr out, const std::vector& inputs) { + int currBeamSize = input->shape()[0]; + int batchSize = input->shape()[2]; + int numHypos = currBeamSize * batchSize; + + auto forward = [this, numHypos](Expr 
out, const std::vector& inputs) { auto query = inputs[0]; auto values = inputs[1]; int dim = values->shape()[-1]; @@ -193,16 +200,15 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, indices_.push_back(id); } - for (size_t beamIdx = 0; beamIdx < currBeamSize; ++beamIdx) { - size_t startIdx = k_ * beamIdx; + for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { + size_t startIdx = k_ * hypoIdx; size_t endIdx = startIdx + k_; std::sort(indices_.begin() + startIdx, indices_.begin() + endIdx); } out->val()->set(indices_); - //std::cerr << "out=" << out->shape() << " " << out->val() << std::endl; }; - Shape kShape({currBeamSize, k_}); + Shape kShape({batchSize, currBeamSize, k_}); //std::cerr << "kShape=" << kShape << std::endl; indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); From 1a3e5ab58ed6b0877aec10392f0b9858bb2ac28a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 13:01:03 -0700 Subject: [PATCH 028/254] debug --- src/data/shortlist.cpp | 4 ++-- src/data/shortlist.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index afc95a776..f912284f8 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -156,8 +156,8 @@ WordIndex LSHShortlist::tryForwardMap(size_t beamIdx, WordIndex wIdx) const { } Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { - std::cerr << "batchSize=" << batchSize << " currBeamSize=" << currBeamSize << std::endl; - std::cerr << "indicesExpr_=" << indicesExpr_->shape() << " " << indicesExpr_->val() << std::endl; + //std::cerr << "batchSize=" << batchSize << " currBeamSize=" << currBeamSize << std::endl; + //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << " " << indicesExpr_->val() << std::endl; assert(indicesExpr_->shape()[0] == batchSize); assert(indicesExpr_->shape()[1] == currBeamSize); return indicesExpr_; diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 40702dfc9..1e23e47ba 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -36,12 +36,12 @@ class Shortlist { Expr lemmaEt, Expr indicesExprBC, int k); + const std::vector& indices() const; public: static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos Shortlist(const std::vector& indices); - const std::vector& indices() const; virtual WordIndex reverseMap(size_t beamIdx, int idx) const; virtual WordIndex tryForwardMap(size_t beamIdx, WordIndex wIdx) const; From daf853e7aa407002dd8b77eea320a0b8f95d3c34 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 13:43:31 -0700 Subject: [PATCH 029/254] batch idx nearly there --- src/data/shortlist.cpp | 15 ++++++++++----- src/data/shortlist.h | 8 ++++---- src/translator/beam_search.cpp | 13 +++++++++---- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index f912284f8..b0fc3b921 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -22,9 +22,9 @@ Shortlist::Shortlist(const std::vector& indices) : indices_(indices) {} const std::vector& Shortlist::indices() const { return indices_; } -WordIndex Shortlist::reverseMap(size_t beamIdx, int idx) const { return indices_[idx]; } +WordIndex Shortlist::reverseMap(size_t batchIdx, size_t beamIdx, int idx) const { return indices_[idx]; } -WordIndex Shortlist::tryForwardMap(size_t beamIdx, WordIndex wIdx) const { +WordIndex Shortlist::tryForwardMap(size_t batchIdx, 
size_t beamIdx, WordIndex wIdx) const { auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx return (int)std::distance(indices_.begin(), first); // return coordinate if found @@ -139,13 +139,18 @@ LSHShortlist::LSHShortlist(int k, int nbits) #define BLAS_FOUND 1 -WordIndex LSHShortlist::reverseMap(size_t beamIdx, int idx) const { - idx = k_ * beamIdx + idx; +WordIndex LSHShortlist::reverseMap(size_t batchIdx, size_t beamIdx, int idx) const { + std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; + int currBeamSize = indicesExpr_->shape()[1]; + std::cerr << "currBeamSize=" << currBeamSize << std::endl; + std::cerr << "indices_=" << indices_.size() << std::endl; + idx = (k_ * currBeamSize) * batchIdx + k_ * beamIdx + idx; + std::cerr << "idx=" << idx << std::endl; assert(idx < indices_.size()); return indices_[idx]; } -WordIndex LSHShortlist::tryForwardMap(size_t beamIdx, WordIndex wIdx) const { +WordIndex LSHShortlist::tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const { //utils::Debug(indices_, "LSHShortlist::tryForwardMap indices_"); auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); bool found = first != indices_.end(); diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 1e23e47ba..b28a32676 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -42,8 +42,8 @@ class Shortlist { Shortlist(const std::vector& indices); - virtual WordIndex reverseMap(size_t beamIdx, int idx) const; - virtual WordIndex tryForwardMap(size_t beamIdx, WordIndex wIdx) const; + virtual WordIndex reverseMap(size_t batchIdx, size_t beamIdx, int idx) const; + virtual WordIndex tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); virtual Expr getIndicesExpr(int batchSize, int currBeamSize) const; @@ -75,8 +75,8 @@ class LSHShortlist: public Shortlist { public: LSHShortlist(int k, int nbits); - virtual WordIndex reverseMap(size_t beamIdx, int idx) const override; - virtual WordIndex tryForwardMap(size_t beamIdx, WordIndex wIdx) const override; + virtual WordIndex reverseMap(size_t batchIdx, size_t beamIdx, int idx) const override; + virtual WordIndex tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const override; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; virtual Expr getIndicesExpr(int batchSize,int currBeamSize) const override; diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index d7ecf5eee..deddd7821 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -50,7 +50,6 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current const auto beamHypIdx = (key / vocabSize) % nBestBeamSize; const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize; const auto origBatchIdx = reverseBatchIdxMap.empty() ? 
currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam - bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx] && factorGroup == 0; WordIndex wordIdx; @@ -85,6 +84,12 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // map wordIdx to word auto prevBeamHypIdx = beamHypIdx; // back pointer + std::cerr << "currentBatchIdx=" << currentBatchIdx + << " origBatchIdx=" << origBatchIdx + << " beamHypIdx=" << beamHypIdx + << " prevBeamHypIdx=" << prevBeamHypIdx + << std::endl; + auto prevHyp = beam[prevBeamHypIdx]; Word word; // If short list has been set, then wordIdx is an index into the short-listed word set, @@ -94,7 +99,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(currentBatchIdx, prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), @@ -115,7 +120,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } } else if (shortlist) - word = Word::fromWordIndex(shortlist->reverseMap(prevBeamHypIdx, wordIdx)); + word = Word::fromWordIndex(shortlist->reverseMap(currentBatchIdx, prevBeamHypIdx, wordIdx)); else word = Word::fromWordIndex(wordIdx); @@ -308,7 +313,7 @@ Histories BeamSearch::search(Ptr graph, Ptr suppressed.erase(std::remove_if(suppressed.begin(), suppressed.end(), [&](WordIndex i) { - return shortlist->tryForwardMap(3343, i) == data::Shortlist::npos; // TODO beamIdx + return shortlist->tryForwardMap(4545, 3343, i) == data::Shortlist::npos; // TODO beamIdx }), suppressed.end()); From 1672201450d707bdd25d4cbe826e50b224de230d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Apr 2021 15:33:49 -0700 Subject: [PATCH 030/254] use origBatchIdx --- src/data/shortlist.cpp | 5 +++-- src/translator/beam_search.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index b0fc3b921..071761285 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -140,9 +140,10 @@ LSHShortlist::LSHShortlist(int k, int nbits) #define BLAS_FOUND 1 WordIndex LSHShortlist::reverseMap(size_t batchIdx, size_t beamIdx, int idx) const { - std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; + std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; + //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; int currBeamSize = indicesExpr_->shape()[1]; - std::cerr << "currBeamSize=" << currBeamSize << std::endl; + //std::cerr << "currBeamSize=" << currBeamSize << std::endl; std::cerr << "indices_=" << indices_.size() << std::endl; idx = (k_ * currBeamSize) * batchIdx + k_ * beamIdx + idx; std::cerr << "idx=" << idx << std::endl; diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 
deddd7821..e3d536286 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -84,11 +84,11 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // map wordIdx to word auto prevBeamHypIdx = beamHypIdx; // back pointer - std::cerr << "currentBatchIdx=" << currentBatchIdx + /*std::cerr << "currentBatchIdx=" << currentBatchIdx << " origBatchIdx=" << origBatchIdx << " beamHypIdx=" << beamHypIdx << " prevBeamHypIdx=" << prevBeamHypIdx - << std::endl; + << std::endl;*/ auto prevHyp = beam[prevBeamHypIdx]; Word word; @@ -99,7 +99,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(currentBatchIdx, prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(origBatchIdx, prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), @@ -120,7 +120,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } } else if (shortlist) - word = Word::fromWordIndex(shortlist->reverseMap(currentBatchIdx, prevBeamHypIdx, wordIdx)); + word = Word::fromWordIndex(shortlist->reverseMap(origBatchIdx, prevBeamHypIdx, wordIdx)); else word = Word::fromWordIndex(wordIdx); From 5be82498aefb17ec549e2b6e6e5544eea00886e7 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 00:13:31 -0700 Subject: [PATCH 031/254] virtual destructor --- src/data/shortlist.cpp | 12 +++++++----- src/data/shortlist.h | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 071761285..9add04a58 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -21,10 +21,12 @@ const T* get(const void*& current, size_t num = 1) { Shortlist::Shortlist(const std::vector& indices) : indices_(indices) {} +Shortlist::~Shortlist() {} + const std::vector& Shortlist::indices() const { return indices_; } -WordIndex Shortlist::reverseMap(size_t batchIdx, size_t beamIdx, int idx) const { return indices_[idx]; } +WordIndex Shortlist::reverseMap(size_t , size_t , int idx) const { return indices_[idx]; } -WordIndex Shortlist::tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const { +WordIndex Shortlist::tryForwardMap(size_t , size_t , WordIndex wIdx) const { auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx return (int)std::distance(indices_.begin(), first); // return coordinate if found @@ -39,7 +41,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp //std::cerr << "currBeamSize=" << currBeamSize << std::endl; //std::cerr << "batchSize=" << batchSize << std::endl; - auto forward = [this](Expr out, const std::vector& inputs) { + auto forward = [this](Expr out, const std::vector& ) { out->val()->set(indices_); }; @@ -151,7 +153,7 @@ WordIndex LSHShortlist::reverseMap(size_t batchIdx, size_t beamIdx, int idx) con return indices_[idx]; } 
-WordIndex LSHShortlist::tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const { +WordIndex LSHShortlist::tryForwardMap(size_t , size_t , WordIndex wIdx) const { //utils::Debug(indices_, "LSHShortlist::tryForwardMap indices_"); auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); bool found = first != indices_.end(); @@ -203,7 +205,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, indices_.clear(); for(auto id : ids) { - indices_.push_back(id); + indices_.push_back((WordIndex)id); } for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { diff --git a/src/data/shortlist.h b/src/data/shortlist.h index b28a32676..f0ddbaf0d 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -41,7 +41,8 @@ class Shortlist { static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos Shortlist(const std::vector& indices); - + virtual ~Shortlist(); + virtual WordIndex reverseMap(size_t batchIdx, size_t beamIdx, int idx) const; virtual WordIndex tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const; From 84d498756b41844ebb38b729296a0b7b6a7b3db3 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 09:52:56 -0700 Subject: [PATCH 032/254] warnings --- src/data/shortlist.cpp | 10 +++++----- src/data/shortlist.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 9add04a58..081ad3426 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -24,9 +24,9 @@ Shortlist::Shortlist(const std::vector& indices) Shortlist::~Shortlist() {} const std::vector& Shortlist::indices() const { return indices_; } -WordIndex Shortlist::reverseMap(size_t , size_t , int idx) const { return indices_[idx]; } +WordIndex Shortlist::reverseMap(int , int , int idx) const { return indices_[idx]; } -WordIndex Shortlist::tryForwardMap(size_t , size_t , WordIndex wIdx) const { +WordIndex Shortlist::tryForwardMap(int , int , WordIndex wIdx) const { auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx return (int)std::distance(indices_.begin(), first); // return coordinate if found @@ -45,7 +45,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp out->val()->set(indices_); }; - int k = indices_.size(); + int k = (int) indices_.size(); Shape kShape({k}); indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); @@ -141,7 +141,7 @@ LSHShortlist::LSHShortlist(int k, int nbits) #define BLAS_FOUND 1 -WordIndex LSHShortlist::reverseMap(size_t batchIdx, size_t beamIdx, int idx) const { +WordIndex LSHShortlist::reverseMap(int batchIdx, int beamIdx, int idx) const { std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; int currBeamSize = indicesExpr_->shape()[1]; @@ -153,7 +153,7 @@ WordIndex LSHShortlist::reverseMap(size_t batchIdx, size_t beamIdx, int idx) con return indices_[idx]; } -WordIndex LSHShortlist::tryForwardMap(size_t , size_t , WordIndex wIdx) const { +WordIndex LSHShortlist::tryForwardMap(int , int , WordIndex wIdx) const { //utils::Debug(indices_, "LSHShortlist::tryForwardMap indices_"); auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); bool found = first != 
indices_.end(); diff --git a/src/data/shortlist.h b/src/data/shortlist.h index f0ddbaf0d..d61c90fa0 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -43,8 +43,8 @@ class Shortlist { Shortlist(const std::vector& indices); virtual ~Shortlist(); - virtual WordIndex reverseMap(size_t batchIdx, size_t beamIdx, int idx) const; - virtual WordIndex tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const; + virtual WordIndex reverseMap(int batchIdx, int beamIdx, int idx) const; + virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); virtual Expr getIndicesExpr(int batchSize, int currBeamSize) const; @@ -76,8 +76,8 @@ class LSHShortlist: public Shortlist { public: LSHShortlist(int k, int nbits); - virtual WordIndex reverseMap(size_t batchIdx, size_t beamIdx, int idx) const override; - virtual WordIndex tryForwardMap(size_t batchIdx, size_t beamIdx, WordIndex wIdx) const override; + virtual WordIndex reverseMap(int batchIdx, int beamIdx, int idx) const override; + virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const override; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; virtual Expr getIndicesExpr(int batchSize,int currBeamSize) const override; From 1e62a16bc14c167f9bca2664e3f6991cf9a8d240 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 10:53:18 -0700 Subject: [PATCH 033/254] warnings --- src/layers/output.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 9046a09e0..b19506f05 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -62,7 +62,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return dot(x, W, transA, transB); }; - auto affineShortlist = [this, affineOrDot](Expr x, Expr W, Expr b, bool transA, bool transB) { + auto affineShortlist = [this, affineOrDot](Expr x, Expr W, Expr b, bool , bool ) { //std::cerr << "x=" << x->shape() << std::endl; //std::cerr << "W=" << W->shape() << std::endl; //std::cerr << "transA=" << transA << " transB=" << transB << std::endl; From b8153bba8efb114710b9335df5b3ddeb980b476a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 12:02:15 -0700 Subject: [PATCH 034/254] warnings --- src/layers/logits.cpp | 2 +- src/layers/output.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index c327bd0d7..d97f860ac 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -240,7 +240,7 @@ std::vector Logits::getFactorMasks(size_t factorGroup, const std::vector< return res; } -std::vector Logits::getFactorMasks2(int batchSize, int currBeamSize, size_t factorGroup, const std::vector& indices) +std::vector Logits::getFactorMasks2(int , int currBeamSize, size_t factorGroup, const std::vector& indices) const { // [lemmaIndex] -> 1.0 for words that do have this factor; else 0 size_t n = indices.empty() diff --git a/src/layers/output.cpp b/src/layers/output.cpp index b19506f05..6b62a9cd1 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -62,7 +62,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return dot(x, W, transA, transB); }; - auto affineShortlist = [this, affineOrDot](Expr x, Expr W, Expr b, bool , bool ) { + auto affineShortlist = [affineOrDot](Expr x, Expr W, Expr b, bool , bool ) { //std::cerr << 
"x=" << x->shape() << std::endl; //std::cerr << "W=" << W->shape() << std::endl; //std::cerr << "transA=" << transA << " transB=" << transB << std::endl; From 560bdbdfdca6ca9446e95419bc5a1be3047ce385 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 20:18:51 +0000 Subject: [PATCH 035/254] warnings --- src/translator/beam_search.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index e3d536286..eb3ecab80 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -99,7 +99,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(origBatchIdx, prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) origBatchIdx, (int) prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), @@ -120,7 +120,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } } else if (shortlist) - word = Word::fromWordIndex(shortlist->reverseMap(origBatchIdx, prevBeamHypIdx, wordIdx)); + word = Word::fromWordIndex(shortlist->reverseMap((int) origBatchIdx, (int) prevBeamHypIdx, wordIdx)); else word = Word::fromWordIndex(wordIdx); From 82db7abf8bd238451c119f85e0e0d46bfdbff825 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 20:50:20 +0000 Subject: [PATCH 036/254] start using only expr --- src/data/shortlist.cpp | 8 ++++---- src/layers/logits.cpp | 20 ++++++++++++-------- src/layers/logits.h | 2 +- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 081ad3426..8ee13585d 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -92,7 +92,7 @@ void Shortlist::broadcast(Expr weights, Expr lemmaEt, Expr indicesExprBC, int k) { - std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; + //std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; int batchSize = indicesExprBC->shape()[0]; int currBeamSize = indicesExprBC->shape()[1]; //int numHypos = batchSize * currBeamSize; @@ -142,13 +142,13 @@ LSHShortlist::LSHShortlist(int k, int nbits) #define BLAS_FOUND 1 WordIndex LSHShortlist::reverseMap(int batchIdx, int beamIdx, int idx) const { - std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; + //std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; int currBeamSize = indicesExpr_->shape()[1]; //std::cerr << "currBeamSize=" << currBeamSize << std::endl; - std::cerr << "indices_=" << indices_.size() << std::endl; + //std::cerr << "indices_=" << indices_.size() << std::endl; idx = (k_ * currBeamSize) * batchIdx + k_ * beamIdx + idx; - std::cerr << "idx=" << idx << std::endl; + //std::cerr << "idx=" << idx << std::endl; assert(idx < indices_.size()); return indices_[idx]; } diff --git a/src/layers/logits.cpp 
b/src/layers/logits.cpp index d97f860ac..12eaa86f7 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -102,17 +102,14 @@ Expr Logits::getFactoredLogits(size_t groupIndex, } else { //std::cerr << "sel=" << sel->shape() << std::endl; - int currBeamSize = sel->shape()[0]; - int batchSize = sel->shape()[2]; - - auto forward = [this, g, currBeamSize, batchSize](Expr out, const std::vector& inputs) { - std::vector indices; + auto forward = [this, g](Expr out, const std::vector& inputs) { Expr lastIndices = inputs[0]; - lastIndices->val()->get(indices); - std::vector masks = getFactorMasks2(batchSize, currBeamSize, g, indices); + std::vector masks = getFactorMasks2(g, lastIndices); out->val()->set(masks); }; + int currBeamSize = sel->shape()[0]; + int batchSize = sel->shape()[2]; Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); //std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); @@ -240,8 +237,15 @@ std::vector Logits::getFactorMasks(size_t factorGroup, const std::vector< return res; } -std::vector Logits::getFactorMasks2(int , int currBeamSize, size_t factorGroup, const std::vector& indices) +std::vector Logits::getFactorMasks2(size_t factorGroup, Expr indicesExpr) const { // [lemmaIndex] -> 1.0 for words that do have this factor; else 0 + std::cerr << "indicesExpr=" << indicesExpr->shape() << std::endl; + //int batchSize + int currBeamSize = indicesExpr->shape()[1]; + std::vector indices; + indicesExpr->val()->get(indices); + + std::cerr << "indices=" << indices.size() << std::endl; size_t n = indices.empty() ? (factoredVocab_->getGroupRange(0).second - factoredVocab_->getGroupRange(0).first) diff --git a/src/layers/logits.h b/src/layers/logits.h index 1c93926d3..362e275bc 100644 --- a/src/layers/logits.h +++ b/src/layers/logits.h @@ -80,7 +80,7 @@ class Logits { } // actually the same as constant(data) for this data type std::vector getFactorMasks(size_t factorGroup, const std::vector& indices) const; - std::vector getFactorMasks2(int batchSize, int currBeamSize, size_t factorGroup, const std::vector& indices) const; + std::vector getFactorMasks2(size_t factorGroup, Expr indicesExpr) const; private: // members From 86d7e30254906d3bac3f9a8126509a58e1ad44cb Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 21:36:05 +0000 Subject: [PATCH 037/254] getFactorMasksMultiDim --- src/layers/logits.cpp | 17 ++++++++++------- src/layers/logits.h | 2 +- src/layers/output.cpp | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 12eaa86f7..26fd607bc 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -104,7 +104,7 @@ Expr Logits::getFactoredLogits(size_t groupIndex, //std::cerr << "sel=" << sel->shape() << std::endl; auto forward = [this, g](Expr out, const std::vector& inputs) { Expr lastIndices = inputs[0]; - std::vector masks = getFactorMasks2(g, lastIndices); + std::vector masks = getFactorMasksMultiDim(g, lastIndices); out->val()->set(masks); }; @@ -113,7 +113,7 @@ Expr Logits::getFactoredLogits(size_t groupIndex, Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); //std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); - //std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; + std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; 
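// ---------------------------------------------------------------------------
// Illustrative sketch only, not part of this patch: how a per-hypothesis
// factor mask can be derived from the flattened shortlist indices that
// getFactorMasksMultiDim() receives. The callback hasFactorGroup() stands in
// for FactoredVocab::lemmaHasFactorGroup(lemma, factorGroup) and lemmaOffset
// for getGroupRange(0).first; both names are placeholders for this sketch.
#include <cstddef>
#include <functional>
#include <vector>

std::vector<float> factorMaskSketch(const std::vector<unsigned int>& indices, // [numHypos * k] shortlisted word ids
                                    size_t numHypos,
                                    size_t lemmaOffset,
                                    const std::function<bool(size_t)>& hasFactorGroup) {
  size_t k = indices.size() / numHypos;     // shortlist entries per hypothesis
  std::vector<float> mask;
  mask.reserve(indices.size());
  for(size_t hypo = 0; hypo < numHypos; ++hypo)
    for(size_t i = 0; i < k; ++i) {
      size_t lemma = indices[hypo * k + i] - lemmaOffset; // map word id to lemma index
      mask.push_back(hasFactorGroup(lemma) ? 1.f : 0.f);  // 1.0 iff the lemma carries this factor group
    }
  return mask; // same [numHypos, k] layout as the shortlist indices
}
// ---------------------------------------------------------------------------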
factorMasks = transpose(factorMasks, {1, 0, 2}); //std::cerr << "factorMasks.2=" << factorMasks->shape() << std::endl; @@ -237,11 +237,13 @@ std::vector Logits::getFactorMasks(size_t factorGroup, const std::vector< return res; } -std::vector Logits::getFactorMasks2(size_t factorGroup, Expr indicesExpr) +std::vector Logits::getFactorMasksMultiDim(size_t factorGroup, Expr indicesExpr) const { // [lemmaIndex] -> 1.0 for words that do have this factor; else 0 std::cerr << "indicesExpr=" << indicesExpr->shape() << std::endl; //int batchSize + int batchSize = indicesExpr->shape()[0]; int currBeamSize = indicesExpr->shape()[1]; + int numHypos = batchSize * currBeamSize; std::vector indices; indicesExpr->val()->get(indices); @@ -249,15 +251,16 @@ std::vector Logits::getFactorMasks2(size_t factorGroup, Expr indicesExpr) size_t n = indices.empty() ? (factoredVocab_->getGroupRange(0).second - factoredVocab_->getGroupRange(0).first) - : indices.size() / currBeamSize; + : indices.size() / numHypos; std::vector res; - res.reserve(currBeamSize * n); + res.reserve(numHypos * n); + std::cerr << "n=" << n << std::endl; // @TODO: we should rearrange lemmaHasFactorGroup as vector[groups[i] of float; then move this // into FactoredVocab - for (size_t currBeam = 0; currBeam < currBeamSize; ++currBeam) { + for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { for(size_t i = 0; i < n; i++) { - size_t idx = currBeam * n + i; + size_t idx = hypoIdx * n + i; size_t lemma = indices.empty() ? i : (indices[idx] - factoredVocab_->getGroupRange(0).first); res.push_back((float)factoredVocab_->lemmaHasFactorGroup(lemma, factorGroup)); } diff --git a/src/layers/logits.h b/src/layers/logits.h index 362e275bc..21d72d2a8 100644 --- a/src/layers/logits.h +++ b/src/layers/logits.h @@ -80,7 +80,7 @@ class Logits { } // actually the same as constant(data) for this data type std::vector getFactorMasks(size_t factorGroup, const std::vector& indices) const; - std::vector getFactorMasks2(size_t factorGroup, Expr indicesExpr) const; + std::vector getFactorMasksMultiDim(size_t factorGroup, Expr indicesExpr) const; private: // members diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 6b62a9cd1..f704704f1 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -62,7 +62,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return dot(x, W, transA, transB); }; - auto affineShortlist = [affineOrDot](Expr x, Expr W, Expr b, bool , bool ) { + auto affineShortlist = [](Expr x, Expr W, Expr b, bool , bool ) { //std::cerr << "x=" << x->shape() << std::endl; //std::cerr << "W=" << W->shape() << std::endl; //std::cerr << "transA=" << transA << " transB=" << transB << std::endl; From 077734331fc3e30477f3841af36f1caa7dcbeaa2 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 21:55:49 +0000 Subject: [PATCH 038/254] debug --- src/data/shortlist.cpp | 1 - src/data/shortlist.h | 1 - src/layers/logits.cpp | 8 ++++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 8ee13585d..8ccfe6abf 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -23,7 +23,6 @@ Shortlist::Shortlist(const std::vector& indices) Shortlist::~Shortlist() {} -const std::vector& Shortlist::indices() const { return indices_; } WordIndex Shortlist::reverseMap(int , int , int idx) const { return indices_[idx]; } WordIndex Shortlist::tryForwardMap(int , int , WordIndex wIdx) const { diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 
d61c90fa0..ff30bb695 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -36,7 +36,6 @@ class Shortlist { Expr lemmaEt, Expr indicesExprBC, int k); - const std::vector& indices() const; public: static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 26fd607bc..d25b20460 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -113,7 +113,7 @@ Expr Logits::getFactoredLogits(size_t groupIndex, Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); //std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); - std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; + //std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; factorMasks = transpose(factorMasks, {1, 0, 2}); //std::cerr << "factorMasks.2=" << factorMasks->shape() << std::endl; @@ -239,7 +239,7 @@ std::vector Logits::getFactorMasks(size_t factorGroup, const std::vector< std::vector Logits::getFactorMasksMultiDim(size_t factorGroup, Expr indicesExpr) const { // [lemmaIndex] -> 1.0 for words that do have this factor; else 0 - std::cerr << "indicesExpr=" << indicesExpr->shape() << std::endl; + //std::cerr << "indicesExpr=" << indicesExpr->shape() << std::endl; //int batchSize int batchSize = indicesExpr->shape()[0]; int currBeamSize = indicesExpr->shape()[1]; @@ -247,14 +247,14 @@ std::vector Logits::getFactorMasksMultiDim(size_t factorGroup, Expr indic std::vector indices; indicesExpr->val()->get(indices); - std::cerr << "indices=" << indices.size() << std::endl; + //std::cerr << "indices=" << indices.size() << std::endl; size_t n = indices.empty() ? (factoredVocab_->getGroupRange(0).second - factoredVocab_->getGroupRange(0).first) : indices.size() / numHypos; std::vector res; res.reserve(numHypos * n); - std::cerr << "n=" << n << std::endl; + //std::cerr << "n=" << n << std::endl; // @TODO: we should rearrange lemmaHasFactorGroup as vector[groups[i] of float; then move this // into FactoredVocab From ab2afff23ad16adeafe6b3a99e1179bdea597aa0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 23:10:17 +0000 Subject: [PATCH 039/254] old-style iter. 
For gcc 5 --- src/data/shortlist.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 8ccfe6abf..e26ce95e2 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -203,7 +203,8 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, distances.data(), ids.data()); indices_.clear(); - for(auto id : ids) { + for(auto iter = ids.begin(); iter != ids.end(); ++iter) { + faiss::Index::idx_t id = *iter; indices_.push_back((WordIndex)id); } From 4500221cfca2ac8baddca5f6406006138f910a7d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 23:50:14 +0000 Subject: [PATCH 040/254] don't define BLAS_FOUND --- src/data/shortlist.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index e26ce95e2..b8e70d42d 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -138,7 +138,7 @@ LSHShortlist::LSHShortlist(int k, int nbits) */ } -#define BLAS_FOUND 1 +//#define BLAS_FOUND 1 WordIndex LSHShortlist::reverseMap(int batchIdx, int beamIdx, int idx) const { //std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; From 7faebf77caf52e423036460e23c66ac5b236632e Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 30 Apr 2021 23:59:05 +0000 Subject: [PATCH 041/254] use args --- src/data/shortlist.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index b8e70d42d..5dffc645c 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -225,7 +225,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExpr_, k_); #else - query; values; + input; weights; isLegacyUntransposedW; b; lemmaEt; ABORT("LSH output layer requires a CPU BLAS library"); #endif } From 379212b75c67b04962f084b7190f024b12dd0068 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 4 May 2021 12:36:10 +0100 Subject: [PATCH 042/254] Enable compute86 where supported (#863) * Enable compute86 where supported --- CHANGELOG.md | 1 + CMakeLists.txt | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b03a07060..40bc72ee7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Compute 8.6 support if using CUDA>=11.1 - Support for RMSNorm as drop-in replace for LayerNorm from `Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization`. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. - Extend suppression of unwanted output symbols, specifically "\n" from default vocabulary if generated by SentencePiece with byte-fallback. Deactivates with --allow-special - Allow for fine-grained CPU intrinsics overrides when BUILD_ARCH != native e.g. 
-DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off diff --git a/CMakeLists.txt b/CMakeLists.txt index 79c8585e9..119bc01f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -325,6 +325,16 @@ if(CUDA_FOUND) option(COMPILE_AMPERE "Compile GPU version with SM80 support" ON) LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets) endif() + if(CUDA_VERSION VERSION_EQUAL "11.1" OR CUDA_VERSION VERSION_GREATER "11.1") + option(COMPILE_KEPLER "Compile GPU version with SM35 support" OFF) # deprecated for CUDA 11 + option(COMPILE_MAXWELL "Compile GPU version with SM50 support" OFF) # deprecated for CUDA 11 + option(COMPILE_PASCAL "Compile GPU version with SM60 support" ON) + option(COMPILE_VOLTA "Compile GPU version with SM70 support" ON) + option(COMPILE_TURING "Compile GPU version with SM75 support" ON) + option(COMPILE_AMPERE "Compile GPU version with SM80 support" ON) + option(COMPILE_AMPERE_RTX "Compile GPU version with SM86 support" ON) + LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets) + endif() if(COMPILE_KEPLER) message(STATUS "Compiling code for Kepler GPUs") @@ -354,6 +364,12 @@ if(CUDA_FOUND) LIST(APPEND COMPUTE -gencode=arch=compute_80,code=sm_80; -gencode=arch=compute_80,code=compute_80) # Ampere GPUs endif(COMPILE_AMPERE) endif() + if(CUDA_VERSION VERSION_EQUAL "11.1" OR CUDA_VERSION VERSION_GREATER "11.1") + if(COMPILE_AMPERE_RTX) + message(STATUS "Compiling code for Ampere RTX GPUs") + LIST(APPEND COMPUTE -gencode=arch=compute_86,code=sm_86; -gencode=arch=compute_86,code=compute_86) # Ampere RTX GPUs + endif(COMPILE_AMPERE_RTX) + endif() if(USE_STATIC_LIBS) set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusparse_LIBRARY}) From fe74576dc3e957f97b4842951fb6ee7ec52fa324 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 4 May 2021 12:36:37 +0100 Subject: [PATCH 043/254] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index de0d73ce0..c1cadea12 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.19 +v1.10.20 From 8b818b7c07084281fdac550ae34dc1511df7aed9 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 17 May 2021 13:25:13 -0700 Subject: [PATCH 044/254] Avoid Ampere misaligment issue --- CHANGELOG.md | 1 + regression-tests | 2 +- src/tensors/gpu/prod.cpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f41b8d12..51de104a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Broken links to MNIST data sets ### Changed +- Set REQUIRED_BIAS_ALIGNMENT = 16 in tensors/gpu/prod.cpp to avoid memory-misalignment on certain Ampere GPUs. - For BUILD_ARCH != native enable all intrinsics types by default, can be disabled like this: -DCOMPILE_AVX512=off - Moved FBGEMM pointer to commit c258054 for gcc 9.3+ fix - Change compile options a la -DCOMPILE_CUDA_SM35 to -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL, diff --git a/regression-tests b/regression-tests index 1afd4eb10..7d612ca5e 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 1afd4eb1014ac451c6a3d6f9b5d34c322902e624 +Subproject commit 7d612ca5e4b27a76f92584dad76d240e34f216d0 diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index 8cfa78cab..4b49c704e 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -22,7 +22,7 @@ namespace gpu { // It seems that the bias must be 8 byte aligned for the cublasLt epilogue to work. 
Therefore, // if the bias pointer is not 8 byte aligned, we do a normal matmul in cublasLt and invoke a // custom epilogue kernel. -static constexpr int REQUIRED_BIAS_ALIGNMENT = 8; +static constexpr int REQUIRED_BIAS_ALIGNMENT = 16; // @TODO: MJD: changed this to 16 to avoid alignment error on A100. Seems to work fine. // Used to set preferences for cublasLt to filter out algos if matrices to not meet default 256 byte alignment int getAlignmentUpTo256(const void *ptr) { From 9fa166be885b025711f27b35453e0f2c00c9933e Mon Sep 17 00:00:00 2001 From: Young Jin Kim Date: Mon, 24 May 2021 22:51:37 -0700 Subject: [PATCH 045/254] Online quantization (#847) * Enable on-line packing/quantization * Add half precision min/max quantization for model weights * Change default quantization of B matrix to min/max, revert a false commit for AggregateAll * Fixed missing half quantization * Fix quantization range for A * Set all default values for the quantize range to 0.f * Use 7 bits clip for the weight matrix quantization to avoid an overflow of VPMADDUBSW --- src/common/config_parser.cpp | 18 ++++++ src/graph/expression_operators.cpp | 89 ++++++++++++++++++++++++-- src/layers/generic.h | 2 + src/tensors/backend.h | 21 ++++++ src/tensors/cpu/backend.h | 24 +++++++ src/tensors/cpu/fbgemm/expanded_gemm.h | 64 ++++++++++++------ src/tensors/cpu/fbgemm/packed_gemm.cpp | 68 ++++++++++---------- src/tensors/cpu/fbgemm/packed_gemm.h | 3 +- src/tensors/gpu/backend.h | 30 +++++++++ src/translator/translator.h | 10 +++ 10 files changed, 270 insertions(+), 59 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index f29b36307..870bf52d5 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -696,6 +696,15 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); + // parameters for on-line quantization + cli.add("--optimize", + "Optimize the graph on-the-fly", false); + cli.add("--gemm-type,-g", + "GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32"); + cli.add("--quantize-range", + "Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization", + 0.f); + #if 0 // @TODO: Ask Hany if there are any decoding-time options // add ULR settings addSuboptionsULR(cli); @@ -747,6 +756,15 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { "Mixed precision for inference, set parameter type in expression graph", {"float32"}); + // parameters for on-line quantization + cli.add("--optimize", + "Optimize the graph on-the-fly", false); + cli.add("--gemm-type,-g", + "GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32"); + cli.add("--quantize-range", + "Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization", + 0.f); + cli.switchGroup(previous_group); // clang-format on } diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 6c7ef91ce..e4a4b0899 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -483,7 +483,45 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { // --optimize --cpu-thread=N with N > 0 are set. 
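// ---------------------------------------------------------------------------
// Illustrative sketch only, not a code change: how the three options added in
// config_parser.cpp above (--optimize, --gemm-type, --quantize-range) are
// expected to reach the CPU backend before the graph is used. This mirrors the
// translator.h changes later in this patch; the Marian headers and Ptr/Options
// types are assumed to be available as in the rest of the code base.
#include "common/options.h"
#include "graph/expression_graph.h"
using namespace marian;

static void configureCpuBackendSketch(Ptr<ExpressionGraph> graph, Ptr<Options> options) {
  // Only meaningful for CPU backends; the GPU backend logs and ignores these calls.
  graph->getBackend()->setOptimized(options->get<bool>("optimize"));
  graph->getBackend()->setGemmType(options->get<std::string>("gemm-type"));     // "float32", "packed16", "packed8"
  graph->getBackend()->setQuantizeRange(options->get<float>("quantize-range")); // 0.0 selects min/max quantization
}
// ---------------------------------------------------------------------------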
if(device == DeviceType::cpu) { if(isFloat(aElementType) && isFloat(bElementType)) { - return Expression(a, b, transA, transB, scale); + if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed || + a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) { +#if USE_FBGEMM + if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) { + auto packedB = cpu::variant::pack( + marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB); + return cpu::variant::dot(marian::Type::packed16, + a, packedB, b->shape(), transA, transB, scale); + } else { + float quantizeRange = b->graph()->getBackend()->getQuantizeRange(); + if(fbgemm::fbgemmHasAvx512Support()) { + auto packedB = cpu::variant::pack(marian::Type::packed8avx512, + b, + cpu::variant::PackMatrix::B, + transB, + quantizeRange); + return cpu::variant::dot(marian::Type::packed8avx512, + a, packedB, b->shape(), transA, transB, scale); + } else if(fbgemm::fbgemmHasAvx2Support()) { + auto packedB = cpu::variant::pack(marian::Type::packed8avx2, + b, + cpu::variant::PackMatrix::B, + transB, + quantizeRange); + return cpu::variant::dot(marian::Type::packed8avx2, + a, packedB, b->shape(), transA, transB, scale); + } else { + ABORT( + "AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed " + "GEMM"); + } + } +#else + ABORT("Packed GEMM is not available in this build"); +#endif // USE_FBGEMM + } else { + return Expression( + a, b, transA, transB, scale); + } } else if(isFloat(aElementType) && isIntgemm(bElementType)) { return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale); } else if(isFloat(aElementType) && isPacked(bElementType)) { @@ -495,7 +533,8 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { // and this cpu lookup is executed only once and the state is kept in FBGEMM. if(fbgemm::fbgemmHasAvx2Support()) { // This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B). 
- return cpu::variant::dot(a, + return cpu::variant::dot(b->value_type(), + a, b, b->shape(), transA, @@ -541,7 +580,48 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { if(device == DeviceType::cpu) { if(isFloat(aElementType) && isFloat(bElementType)) { - return affineDefault(a, b, bias, transA, transB, scale); + if(a->graph()->getBackend()->isOptimized()) { + if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed || + a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) { +#if USE_FBGEMM + if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) { + auto packedB = cpu::variant::pack( + marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB); + return cpu::variant::affine(marian::Type::packed16, + a, packedB, b->shape(), bias, transA, transB, scale); + } else { + float quantizeRange = b->graph()->getBackend()->getQuantizeRange(); + if(fbgemm::fbgemmHasAvx512Support()) { + auto packedB = cpu::variant::pack(marian::Type::packed8avx512, + b, + cpu::variant::PackMatrix::B, + transB, + quantizeRange); + return cpu::variant::affine(marian::Type::packed8avx512, + a, packedB, b->shape(), bias, transA, transB, scale); + } else if(fbgemm::fbgemmHasAvx2Support()) { + auto packedB = cpu::variant::pack(marian::Type::packed8avx2, + b, + cpu::variant::PackMatrix::B, + transB, + quantizeRange); + return cpu::variant::affine(marian::Type::packed8avx2, + a, packedB, b->shape(), bias, transA, transB, scale); + } else { + ABORT( + "AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed " + "GEMM"); + } + } +#else + ABORT("Packed GEMM is not available in this build"); +#endif // USE_FBGEMM + } else { + return affineDefault(a, b, bias, transA, transB, scale); + } + } else { + return affineDefault(a, b, bias, transA, transB, scale); + } } else if(isFloat(aElementType) && isIntgemm(bElementType)) { return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale); } else if(isFloat(aElementType) && isPacked(bElementType)) { @@ -553,7 +633,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // and this cpu lookup is executed only once and the state is kept in FBGEMM. if(fbgemm::fbgemmHasAvx2Support()) { // This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B). 
- return cpu::variant::affine(a, + return cpu::variant::affine(b->value_type(), + a, b, b->shape(), bias, diff --git a/src/layers/generic.h b/src/layers/generic.h index 8f390bd7d..9af033df5 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -177,6 +177,8 @@ static inline std::function activationByName(const std::string& actN return (ActivationFunction*)swish; else if (actName == "gelu") return (ActivationFunction*)gelu; + else if (actName == "sigmoid") + return (ActivationFunction*)sigmoid; else if (actName == "") // return identity function if activation name is empty return [](Expr x) { return x; }; ABORT("Invalid activation name '{}'", actName); diff --git a/src/tensors/backend.h b/src/tensors/backend.h index 160b828d3..e0e93039e 100644 --- a/src/tensors/backend.h +++ b/src/tensors/backend.h @@ -5,6 +5,14 @@ namespace marian { +// GEMM type enum +typedef enum { + Auto = 0, // auto tuning between available GEMMs + Float32 = 1, // MKL based GEMM, fp32 + FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing + FbInt8Packed = 11 // FBGEMM based int8 GEMM with packing +} GemmType; + class Backend { protected: DeviceId deviceId_; @@ -21,6 +29,19 @@ class Backend { // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name. virtual void setDevice() = 0; virtual void synchronize() = 0; + + // for CPU, sets to use optimized code for inference. + // for GPU, this is invalid. for gpu, isOptimized() function always returns false. + virtual void setOptimized(bool optimize) = 0; + virtual bool isOptimized() = 0; + // for CPU, selects different GEMM types for the inference. + // for GPU, there's no gemm type. so, it does nothing. + virtual void setGemmType(std::string gemmType) = 0; + virtual GemmType getGemmType() = 0; + // for CPU, sets quantization range of weight matrices for the inference. + // for GPU, there's no quantization. so, it does nothing. + virtual void setQuantizeRange(float range) = 0; + virtual float getQuantizeRange() = 0; }; Ptr BackendByDeviceId(DeviceId deviceId, size_t seed); diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 398e24240..f52ff6a33 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -10,10 +10,34 @@ namespace marian { namespace cpu { class Backend : public marian::Backend { +protected: + bool optimized_{false}; + GemmType gemmType_{GemmType::Float32}; + float quantizeRange_{0.f}; + public: Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {} void setDevice() override {} void synchronize() override {} + + // for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU. + void setOptimized(bool optimize) override { optimized_ = optimize; } + bool isOptimized() override { return optimized_; } + // for CPU only, selects different GEMM types for the inference. Does nothing for GPU. + void setGemmType(std::string gemmType) override { + if (gemmType == "auto") gemmType_ = GemmType::Auto; + else if (gemmType == "float32") gemmType_ = GemmType::Float32; +#if USE_FBGEMM + else if (gemmType == "packed16") gemmType_ = GemmType::FbFp16Packed; + else if (gemmType.find("packed8") == 0) gemmType_ = GemmType::FbInt8Packed; +#endif // USE_FBGEMM + else ABORT("Unknown GEMM type - '{}'", gemmType); + } + GemmType getGemmType() override { return gemmType_; } + // for CPU, sets quantization range of weight matrices for the inference. + // for GPU, there's no quantization. so, it does nothing. 
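// ---------------------------------------------------------------------------
// Illustrative sketch only, not part of this patch: what the quantize range
// means downstream in fbgemmPacked8Pack(). For each column of the weight
// matrix B a scale and zero point are derived either from the column min/max
// (quantizeRange == 0) or from mean +/- quantizeRange * stddev. Only 7 bits of
// range (127 levels, offset 63) are used to avoid overflowing VPMADDUBSW.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

static void columnQuantParamsSketch(const std::vector<float>& column,
                                    float quantizeRange,
                                    float& scale,
                                    int32_t& zeroPoint) {
  constexpr int kQuantizedRange = 127;  // half of the int8 range
  constexpr int kQuantizedMax   = 63;
  float min = std::numeric_limits<float>::max();
  float max = std::numeric_limits<float>::lowest();
  double mean = 0, sqrSum = 0;
  for(float v : column) {
    min = std::min(min, v);
    max = std::max(max, v);
    mean += v;
    sqrSum += double(v) * v;
  }
  if(quantizeRange != 0.f) {            // sigma-based range instead of min/max
    mean /= column.size();
    double stddev = std::sqrt(sqrSum / column.size() - mean * mean);
    min = float(mean - quantizeRange * stddev);
    max = float(mean + quantizeRange * stddev);
  }
  scale = (max - min) / kQuantizedRange;
  zeroPoint = int32_t(kQuantizedMax - max / scale);
}
// ---------------------------------------------------------------------------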
+ void setQuantizeRange(float range) override { quantizeRange_ = range; } + float getQuantizeRange() override { return quantizeRange_; } }; } // namespace cpu diff --git a/src/tensors/cpu/fbgemm/expanded_gemm.h b/src/tensors/cpu/fbgemm/expanded_gemm.h index fb07bbad5..2c376d6e2 100644 --- a/src/tensors/cpu/fbgemm/expanded_gemm.h +++ b/src/tensors/cpu/fbgemm/expanded_gemm.h @@ -138,15 +138,18 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { int nrow_; int ncol_; uint64_t packsize_; + float quantizeRange_; FbgemmPacked8PackNodeOp(Expr a, PackMatrix packMat, marian::Type packType, - bool transpose) - : UnaryNodeOp(a, newShape(a, transpose), Type::uint8), + bool transpose, + float quantizeRange) + : UnaryNodeOp(a, newShape(a, packType, transpose), Type::uint8), packMat_(packMat), packType_(packType), - transpose_(transpose) { + transpose_(transpose), + quantizeRange_(quantizeRange){ if(packMat != PackMatrix::B) ABORT("Only prepacking of B (weight matrix) is supported"); if(!memoize_) @@ -161,7 +164,8 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { transpose_, nrow_, ncol_, - packsize_)) + packsize_, + quantizeRange_)) }; #else // USE_FBGEMM ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled."); @@ -177,13 +181,19 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { const std::string type() override { return "packMatInt8"; } #if USE_FBGEMM - Shape newShape(Expr a, bool transpose) { - fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_); + Shape newShape(Expr a, marian::Type packType, bool transpose) { + fbgemmPacked8PackInfo( + a->shape(), + packType, + transpose, + nrow_, + ncol_, + packsize_); Shape outShape({(int)packsize_}); return outShape; } #else - Shape newShape(Expr /*a*/, bool /*transpose*/) { + Shape newShape(Expr /*a*/, marian::Type /*packType*/, bool /*transpose*/) { ABORT("Packed GEMM requires a build with USE_FBGEMM enabled"); return Shape(); } @@ -282,10 +292,17 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp { size_t k_; bool transA_; bool transB_; + Type elementType_; public: - FbgemmPacked8AffineNodeOp(const std::vector& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/) - : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ { + FbgemmPacked8AffineNodeOp(Type elementType, + const std::vector& nodes, + Shape bShape, + bool transA, + bool transB, + float /*scalar*/) + : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32), + elementType_(elementType) { transA_ = transA; transB_ = transB; m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1]; @@ -324,7 +341,8 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp { #if USE_FBGEMM // Do addBias only if it has a bias term if (children().size() > 2) { - nodeOps = { NodeOp(fbgemmPacked8Gemm(val_, + nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_, + val_, child(0)->val(), child(1)->val(), m_, @@ -334,7 +352,8 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp { transB_); marian::cpu::integer::AddBias(val_, child(2)->val())) }; } else { - nodeOps = { NodeOp(fbgemmPacked8Gemm(val_, + nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_, + val_, child(0)->val(), child(1)->val(), m_, @@ -358,39 +377,46 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp { const std::string type() override { return "gemmPacked8"; } }; -static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) { +static inline Expr affine(Type elementType, + 
Expr a, + Expr b, + Shape bShape, + Expr c, + bool transA, + bool transB, + float scalar) { std::vector nodes = {a, b, c}; - Type elementType = b->value_type(); if (elementType == Type::packed16) return Expression(nodes, bShape, transA, transB, scalar); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(nodes, bShape, transA, transB, scalar); + return Expression( + elementType, nodes, bShape, transA, transB, scalar); else { ABORT("Only int8 and fp16 are available. {}", elementType); return nullptr; } } -static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) { +static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float quantizeRange = 0.f) { if (elementType == Type::packed16) return Expression(a, packMat, transpose); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(a, packMat, elementType, transpose); + return Expression(a, packMat, elementType, transpose, quantizeRange); else { ABORT("Only int8 and fp16 are available. {}", elementType); return nullptr; } } -static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) { +static inline Expr dot(Type elementType, Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) { std::vector nodes = {a, b}; - Type elementType = b->value_type(); if (elementType == Type::packed16) return Expression(nodes, bShape, transA, transB, scalar); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(nodes, bShape, transA, transB, scalar); + return Expression( + elementType, nodes, bShape, transA, transB, scalar); else { ABORT("Only int8 and fp16 are available. {}", elementType); return nullptr; diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp index 65dca1f70..dd81d0f7f 100644 --- a/src/tensors/cpu/fbgemm/packed_gemm.cpp +++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp @@ -360,10 +360,10 @@ void fbgemmPacked8Pack(marian::Tensor out, const float* data = inData; float val = 0; - - // Use half of the quantization range to prevent overflow of VPMADDUBSW - constexpr static int quantizedRange = 127; - constexpr static int quantizedMax = 63; + + // Use half of the quantization range to prevent overflow of VPMADDUBSW + constexpr static int quantizedRange = 127; + constexpr static int quantizedMax = 63; // This routine compute the quantization range for each column - either one of min/max range or quantRangeStdDevs sigma range. for (size_t jj = 0; jj < n; jj++) { // for each column, collect stats (min/max or mean/std.dev.) @@ -371,32 +371,32 @@ void fbgemmPacked8Pack(marian::Tensor out, double mean = 0, sqrSum = 0; for (size_t ii = 0; ii < k; ii++) { // in a column, go throuhg all the rows and collect stats val = getVal2dArr(data, ii, jj, k, n, transpose); - // If quantRangeStdDevs is 0.f, min/max values of the columns is used as a quantization range - if(quantRangeStdDevs == 0.f) { - if(min > val) - min = val; - if(max < val) - max = val; - } else { - // Quantize by std.dev. range - mean += val; - sqrSum += val * val; - } - } - // If a quantization range (in multiples of std. dev.) 
is given with a non-zero value, - // it calculate the range for this column (different quantization scale/offset are used for each column) - if(quantRangeStdDevs != 0.f) { - mean /= k; - sqrSum /= k; - sqrSum -= mean * mean; - sqrSum = sqrt(sqrSum); - min = (float)(mean - quantRangeStdDevs * sqrSum); - max = (float)(mean + quantRangeStdDevs * sqrSum); - } - // based on the quantization range, this computes the scale and offset for the quantization - quantScaleB[jj] = (max - min) / quantizedRange; - quantZeropointB[jj] = (int32_t)(quantizedMax - max / quantScaleB[jj]); - } + // If quantRangeStdDevs is 0.f, min/max values of the columns is used as a quantization range + if(quantRangeStdDevs == 0.f) { + if(min > val) + min = val; + if(max < val) + max = val; + } else { + // Quantize by std.dev. range + mean += val; + sqrSum += val * val; + } + } + // If a quantization range (in multiples of std. dev.) is given with a non-zero value, + // it calculate the range for this column (different quantization scale/offset are used for each column) + if(quantRangeStdDevs != 0.f) { + mean /= k; + sqrSum /= k; + sqrSum -= mean * mean; + sqrSum = sqrt(sqrSum); + min = (float)(mean - quantRangeStdDevs * sqrSum); + max = (float)(mean + quantRangeStdDevs * sqrSum); + } + // based on the quantization range, this computes the scale and offset for the quantization + quantScaleB[jj] = (max - min) / quantizedRange; + quantZeropointB[jj] = (int32_t)(quantizedMax - max / quantScaleB[jj]); + } // 2. quantize int8_t* quantized = 0; @@ -410,7 +410,7 @@ void fbgemmPacked8Pack(marian::Tensor out, TensorQuantizationParams bQuantParam; bQuantParam.scale = quantScaleB[jj]; bQuantParam.zero_point = quantZeropointB[jj]; - bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW + bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW if (transpose) fbgemm::Quantize(data + jj * k, quantized + jj * k, k, bQuantParam); @@ -536,7 +536,8 @@ void fbgemmPacked16Gemm(marian::Tensor C, // k: the number of columns in A and the number of rows in B // transA: whether A matrix is transposed or not // transB: whether B matrix is transposed or not -void fbgemmPacked8Gemm(marian::Tensor C, +void fbgemmPacked8Gemm(Type packType, + marian::Tensor C, const marian::Tensor A, const marian::Tensor B, const size_t m, @@ -544,9 +545,6 @@ void fbgemmPacked8Gemm(marian::Tensor C, const size_t k, const int transA, const int transB) { - // pack type - marian::Type packType = B->type(); - const fbgemm::BlockingFactors* params = getBlockingFactors(packType); // Check if the packed format matches with the available AVX instruction set in the machine diff --git a/src/tensors/cpu/fbgemm/packed_gemm.h b/src/tensors/cpu/fbgemm/packed_gemm.h index 694860d48..e5740a434 100644 --- a/src/tensors/cpu/fbgemm/packed_gemm.h +++ b/src/tensors/cpu/fbgemm/packed_gemm.h @@ -135,7 +135,8 @@ void fbgemmPacked16Gemm(marian::Tensor C, // k: the number of columns in A and rows in B // transA: transpose of A matrix // transB: transpose of B matrix -void fbgemmPacked8Gemm(marian::Tensor C, +void fbgemmPacked8Gemm(Type packType, + marian::Tensor C, const marian::Tensor A, const marian::Tensor B, const size_t m, diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h index 75cc604da..410b41a49 100644 --- a/src/tensors/gpu/backend.h +++ b/src/tensors/gpu/backend.h @@ -64,6 +64,36 @@ class Backend : public marian::Backend { return cusparseHandle_; } + // for CPU, sets to use optimized 
code for inference. + // for GPU, this is invalid. for gpu, isOptimized() function always returns false. + void setOptimized(bool optimize) override { + LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize); + } + bool isOptimized() override { + LOG_ONCE(info, "isOptimized() not supported for GPU"); + return false; + }; + + // for CPU, selects different GEMM types for the inference. + // for GPU, there's no gemm type. so, it does nothing. + void setGemmType(std::string gemmType) override { + LOG_ONCE(info, "setGemmType() not supported for GPU_{}", gemmType); + } + GemmType getGemmType() override { + LOG_ONCE(info, "getGemmType() not supported for GPU"); + return GemmType::Float32; + } + + // for CPU, sets quantization range of weight matrices for the inference. + // for GPU, there's no quantization. so, it does nothing. + void setQuantizeRange(float range) override { + LOG_ONCE(info, "setQuantizeRange() not supported for GPU_{}", range); + } + float getQuantizeRange() override { + LOG_ONCE(info, "getQuantizeRange() not supported for GPU"); + return 0.f; + } + CudaCompute getCudaComputeCapability() { return compute_; } private: diff --git a/src/translator/translator.h b/src/translator/translator.h index fe01065b6..579f126de 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -89,6 +89,11 @@ class Translate : public ModelTask { auto prec = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(prec[0])); graph->setDevice(device); + if (device.type == DeviceType::cpu) { + graph->getBackend()->setOptimized(options_->get("optimize")); + graph->getBackend()->setGemmType(options_->get("gemm-type")); + graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); + } graph->reserveWorkspaceMB(options_->get("workspace")); graphs_[id] = graph; @@ -282,6 +287,11 @@ class TranslateService : public ModelServiceTask { auto precison = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph graph->setDevice(device); + if (device.type == DeviceType::cpu) { + graph->getBackend()->setOptimized(options_->get("optimize")); + graph->getBackend()->setGemmType(options_->get("gemm-type")); + graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); + } graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); From 6e87f16e481dce956081ce5e7a0f8e243c4c9522 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Wed, 26 May 2021 06:12:33 +0000 Subject: [PATCH 046/254] Merged PR 18763: Fix adding new validation metrics with --valid-reset-stalled This fixes a bug that's been discovered recently by checking if a validator exists before resetting its stalled validations. Regression test for it is in: https://github.com/marian-nmt/marian-regression-tests/pull/80 --- CHANGELOG.md | 1 + src/training/scheduler.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 569c607d6..2f4141c84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Compute aligned memory sizes using exact sizing ### Fixed +- Adding new validation metrics when training is restarted and --reset-valid-stalled is used - Missing depth-scaling in transformer FFN - Fixed an issue when loading intgemm16 models from unaligned memory. 
- Fix building marian with gcc 9.3+ and FBGEMM diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 8d4fa30ca..3cc3b2076 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -511,7 +511,8 @@ class Scheduler : public TrainingObserver { state_->stalled = 0; state_->maxStalled = 0; for(const auto& validator : validators_) { - state_->validators[validator->type()]["stalled"] = 0; + if(state_->validators[validator->type()]) + state_->validators[validator->type()]["stalled"] = 0; } } From 2c1b16f43e141498dbd4cd5133956567d132bd01 Mon Sep 17 00:00:00 2001 From: Rohit Jain Date: Fri, 4 Jun 2021 10:13:00 +0000 Subject: [PATCH 047/254] Merged PR 19252: Update sentencepiece module to include CMake changes Update SPM module to include CMake changes. --- src/3rd_party/sentencepiece | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 3c9326012..6f24a6b52 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 3c93260124d39613e31d63443dbb594197e69607 +Subproject commit 6f24a6b52a521a3467e99a9c175ba9e136905217 From 28e5e2260aa44b54e476b06028d0c3592b7010ff Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 4 Jun 2021 13:39:03 -0700 Subject: [PATCH 048/254] filter once for shortlist --- src/data/shortlist.cpp | 8 +++++++- src/data/shortlist.h | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 5dffc645c..3e07983af 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -19,7 +19,8 @@ const T* get(const void*& current, size_t num = 1) { ////////////////////////////////////////////////////////////////////////////////////// Shortlist::Shortlist(const std::vector& indices) - : indices_(indices) {} + : indices_(indices) + , done_(false) {} Shortlist::~Shortlist() {} @@ -34,6 +35,10 @@ WordIndex Shortlist::tryForwardMap(int , int , WordIndex wIdx) const { } void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { + if (done_) { + return; + } + //if (indicesExpr_) return; int currBeamSize = input->shape()[0]; int batchSize = input->shape()[2]; @@ -50,6 +55,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp Expr indicesExprBC = getIndicesExpr(batchSize, currBeamSize); broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExprBC, k); + done_ = true; } Expr Shortlist::getIndicesExpr(int batchSize, int beamSize) const { diff --git a/src/data/shortlist.h b/src/data/shortlist.h index ff30bb695..e353cdd9e 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -29,7 +29,8 @@ class Shortlist { Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) Expr cachedShortb_; // these match the current value of shortlist_ Expr cachedShortLemmaEt_; - + bool done_; + virtual void broadcast(Expr weights, bool isLegacyUntransposedW, Expr b, From f19ebbae69e06d85979ac16b76fb1acf0dc4e695 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 4 Jun 2021 14:21:26 -0700 Subject: [PATCH 049/254] debug --- src/layers/output.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index f704704f1..9975416b0 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -261,7 +261,16 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << 
std::endl; //std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; Expr e = factorSoftmax * cachedShortLemmaEt; - //std::cerr << "e.1=" << e->shape() << std::endl; + /* + factorSoftmax= beam x 1 x batch x vocab + cachedShortLemmaEt= 1 x 10 x 1 x vocab + e= beam x 10 x batch x vocab + + std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; + std::cerr << "e=" << e->shape() << std::endl; + std::cerr << std::endl; + */ e = sum(e, 3); //std::cerr << "e.2=" << e->shape() << std::endl; e = transpose(e, {0, 3, 2, 1}); From 77c0cac1f21f255e7f78bbf7b0d2b138afb3743a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 7 Jun 2021 09:14:39 -0700 Subject: [PATCH 050/254] broadcasting bdot --- src/graph/node_operators_binary.h | 24 ++++++++-- src/tensors/cpu/prod.cpp | 71 +++++++++++++++++++++++------- src/tensors/gpu/prod.cpp | 62 +++++++++++++++++++------- src/tests/units/operator_tests.cpp | 60 +++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 36 deletions(-) diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 91fc29da2..bd52103a9 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -529,11 +529,27 @@ class DotBatchedNodeOp : public NaryNodeOp { shapeB.set(-1, b->shape()[-2]); } - Shape outShape = shapeA; - outShape.set(-1, shapeB[-1]); ABORT_IF(shapeA[-1] != shapeB[-2], - "Batched matrix product requires inner dimensions to match in {}{} * {}{}", std::string(shapeA), transA, std::string(shapeB), transB); - return outShape; + "Batched matrix product requires inner dimensions to match in {}{} * {}{}", + std::string(shapeA), transA, std::string(shapeB), transB); + + // create shapes for batch dimensions only + auto shapeBatchA = shapeA; + shapeBatchA.set(-1, 1); + shapeBatchA.set(-2, 1); + + auto shapeBatchB = shapeB; + shapeBatchB.set(-1, 1); + shapeBatchB.set(-2, 1); + + // broadcast batch dimensions + auto shapeOut = Shape::broadcast({shapeBatchA, shapeBatchB}); + + // set non-batch dimensions in output + shapeOut.set(-1, shapeA[-2]); + shapeOut.set(-2, shapeB[-1]); + + return shapeOut; } NodeOps forwardOps() override { diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 6e28bdd23..066867e4f 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -93,31 +93,58 @@ void ProdBatched(marian::Tensor C, #if BLAS_FOUND float alpha = scalar; - size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); - size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]); + // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements + // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... is a broadcastable shape as in element-wise kernels. 
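// ---------------------------------------------------------------------------
// Illustrative sketch only, not part of this patch: how the output shape of a
// broadcasting batched matmul is derived. All dimensions except the last two
// broadcast element-wise (a dimension of 1 matches any size); the last two obey
// the usual (m, k) x (k, n) -> (m, n) rule. Plain std::vector shapes are used
// instead of marian::Shape, and transposition of the last two dimensions is
// ignored for brevity.
#include <algorithm>
#include <cassert>
#include <vector>

std::vector<int> bdotShapeSketch(std::vector<int> a, std::vector<int> b) {
  size_t rank = std::max(a.size(), b.size());
  a.insert(a.begin(), rank - a.size(), 1);       // left-pad with singleton dims
  b.insert(b.begin(), rank - b.size(), 1);
  assert(a[rank - 1] == b[rank - 2]);            // inner dimensions must match
  std::vector<int> out(rank);
  for(size_t i = 0; i + 2 < rank; ++i) {         // broadcast the batch dimensions
    assert(a[i] == b[i] || a[i] == 1 || b[i] == 1);
    out[i] = std::max(a[i], b[i]);
  }
  out[rank - 2] = a[rank - 2];                   // m
  out[rank - 1] = b[rank - 1];                   // n
  return out;  // e.g. {2,1,2,2} x {1,3,2,2} -> {2,3,2,2}, as in the unit test added below
}
// ---------------------------------------------------------------------------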
+ + auto aShape = A->shape(); + auto bShape = B->shape(); + + // make sure both shape have the same number of dimensions via broadcasting + size_t maxLength = std::max(aShape.size(), bShape.size()); + if(aShape.size() != bShape.size()) { + Shape ones(std::vector(maxLength, 1)); + aShape = Shape::broadcast({aShape, ones}); + bShape = Shape::broadcast({bShape, ones}); + } + + // Create meta-shapes without last 2 dimensions + Shape aShapeMeta, bShapeMeta, cShapeMeta; + aShapeMeta.resize(maxLength - 2); + bShapeMeta.resize(maxLength - 2); + for(size_t i = 0; i < maxLength - 2; ++i) { + aShapeMeta.set(i, aShape[i]); + bShapeMeta.set(i, bShape[i]); + } + cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); - size_t m = A->shape()[-2]; - size_t k = A->shape()[-1]; + size_t m = aShape[-2]; + size_t k = aShape[-1]; if(transA) std::swap(m, k); - size_t l = B->shape()[-2]; - size_t n = B->shape()[-1]; + size_t l = bShape[-2]; + size_t n = bShape[-1]; if(transB) std::swap(l, n); - size_t lda = A->shape()[-1]; - size_t ldb = B->shape()[-1]; - size_t ldc = B->shape()[-1]; + size_t lda = aShape[-1]; + size_t ldb = bShape[-1]; + size_t ldc = bShape[-1]; if(transB) - ldc = B->shape()[-2]; + ldc = bShape[-2]; - auto strideB = batchB == 1 ? 0 : n * k; - auto strideA = batchA == 1 ? 0 : m * k; + auto strideA = m * k; + auto strideB = n * k; auto strideC = n * m; - auto batchC = std::max(batchA, batchB); + auto batchC = cShapeMeta.elements(); + + // Convert to functional shapes to be able to map dimensions. @TODO merge this + functional::Shape aShapeMetaF = aShapeMeta; + functional::Shape bShapeMetaF = bShapeMeta; + functional::Shape cShapeMetaF = cShapeMeta; + #if MKL_FOUND CBLAS_TRANSPOSE transA_forarr = CblasNoTrans; CBLAS_TRANSPOSE transB_forarr = CblasNoTrans; @@ -156,9 +183,14 @@ void ProdBatched(marian::Tensor C, // This loop initializes the array pointers in the same way as the for loop // in the normal sgemm version a few lines below + functional::Array dims; for(size_t i = 0; i < batchC; ++i) { - a_array[i] = A->data() + (i % batchA) * strideA; - b_array[i] = B->data() + (i % batchB) * strideB; + cShapeMetaF.dims(i, dims); + auto aIndex = aShapeMetaF.bindex(dims); + auto bIndex = bShapeMetaF.bindex(dims); + + a_array[i] = A->data() + aIndex * strideA; + b_array[i] = B->data() + bIndex * strideB; c_array[i] = C->data() + i * strideC; } cblas_sgemm_batch (CblasRowMajor, @@ -178,16 +210,21 @@ void ProdBatched(marian::Tensor C, group_count, &group_size[0]); #else + functional::Array dims; for(size_t i = 0; i < batchC; ++i) { + cShapeMetaF.dims(i, dims); + auto aIndex = aShapeMetaF.bindex(dims); + auto bIndex = bShapeMetaF.bindex(dims); + sgemm(transA, transB, (int)m, (int)n, (int)k, alpha, - A->data() + (i % batchA) * strideA, + A->data() + aIndex * strideA, (int)lda, - B->data() + (i % batchB) * strideB, + B->data() + bIndex * strideB, (int)ldb, beta, C->data() + i * strideC, diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index 4b49c704e..3e35237f0 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -347,25 +347,46 @@ void ProdBatchedTyped(marian::Tensor C, CUDA_CHECK(cudaSetDevice((int)C->getDeviceId().no)); ComputeType alpha = scalar; - int batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); - int batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]); + // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements + // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... 
is a broadcastable shape as in element-wise kernels. + + auto aShape = A->shape(); + auto bShape = B->shape(); + + // make sure both shape have the same number of dimensions via broadcasting + size_t maxLength = std::max(aShape.size(), bShape.size()); + if(aShape.size() != bShape.size()) { + Shape ones(std::vector(maxLength, 1)); + aShape = Shape::broadcast({aShape, ones}); + bShape = Shape::broadcast({bShape, ones}); + } + + // Create meta-shapes without last 2 dimensions + Shape aShapeMeta, bShapeMeta, cShapeMeta; + aShapeMeta.resize(maxLength - 2); + bShapeMeta.resize(maxLength - 2); + for(size_t i = 0; i < maxLength - 2; ++i) { + aShapeMeta.set(i, aShape[i]); + bShapeMeta.set(i, bShape[i]); + } + cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); - int m = A->shape()[-2]; - int k = A->shape()[-1]; + size_t m = aShape[-2]; + size_t k = aShape[-1]; if(transA) std::swap(m, k); - int l = B->shape()[-2]; - int n = B->shape()[-1]; + size_t l = bShape[-2]; + size_t n = bShape[-1]; if(transB) std::swap(l, n); - int lda = A->shape()[-1]; - int ldb = B->shape()[-1]; - int ldc = B->shape()[-1]; + size_t lda = aShape[-1]; + size_t ldb = bShape[-1]; + size_t ldc = bShape[-1]; if(transB) - ldc = B->shape()[-2]; + ldc = bShape[-2]; cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -374,18 +395,29 @@ void ProdBatchedTyped(marian::Tensor C, auto cublasHandle = backend->getCublasHandle(); auto compute = backend->getCudaComputeCapability(); - auto strideA = batchA == 1 ? 0 : m * k; - auto strideB = batchB == 1 ? 0 : n * k; + auto strideA = m * k; + auto strideB = n * k; auto strideC = n * m; - auto batchC = std::max(batchA, batchB); + + auto batchC = cShapeMeta.elements(); + + // Convert to functional shapes to be able to map dimensions. 
@TODO merge this + functional::Shape aShapeMetaF = aShapeMeta; + functional::Shape bShapeMetaF = bShapeMeta; + functional::Shape cShapeMetaF = cShapeMeta; std::vector aptr; std::vector bptr; std::vector cptr; + functional::Array dims; for(int i = 0; i < batchC; i++) { - aptr.push_back(A->data() + (i % batchA) * strideA); - bptr.push_back(B->data() + (i % batchB) * strideB); + cShapeMetaF.dims(i, dims); + auto aIndex = aShapeMetaF.bindex(dims); + auto bIndex = bShapeMetaF.bindex(dims); + + aptr.push_back(A->data() + aIndex * strideA); + bptr.push_back(B->data() + bIndex * strideB); cptr.push_back(C->data() + i * strideC); } diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 1a18da999..f3b5fda34 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -615,6 +615,66 @@ void tests(DeviceType device, Type floatType = Type::float32) { CHECK(values2 == values); } + SECTION("bdot") { + graph->clear(); + values.clear(); + + std::vector vA({ 1, 2, + 3, 4, + 5, 6, + 7, 8}); + + std::vector vB({ 1, 2, + 3, 4, + 5, 6, + 7, 8, + 9, 10, + 11, 12}); + + std::vector vC({ 7, 10, + 15, 22, + 19, 22, + 43, 50, + 31, 34, + 71, 78, + 23, 34, + 31, 46, + 67, 78, + 91, 106, + 111, 122, + 151, 166}); + + std::vector vCt({ 5, 11, + 11, 25, + 17, 23, + 39, 53, + 29, 35, + 67, 81, + 17, 39, + 23, 53, + 61, 83, + 83, 113, + 105, 127, + 143, 173}); + + auto A = graph->param("A", {2, 1, 2, 2}, inits::fromVector(vA)); + auto B = graph->param("B", {1, 3, 2, 2}, inits::fromVector(vB)); + + auto C = bdot(A, B, /*transA=*/false, /*transB=*/false); + auto Ct = bdot(A, B, /*transA=*/false, /*transB=*/true); + + graph->forward(); + + CHECK(C->shape() == Shape({2, 3, 2, 2})); + CHECK(Ct->shape() == Shape({2, 3, 2, 2})); + + C->val()->get(values); + CHECK(vC == values); + + Ct->val()->get(values); + CHECK(vCt == values); + } + SECTION("repeat") { graph->clear(); values.clear(); From 2e6f0293ab8720e98324dae97538f2912bbca259 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 7 Jun 2021 11:23:03 -0700 Subject: [PATCH 051/254] add legacy bdot --- src/graph/expression_operators.cpp | 4 + src/graph/expression_operators.h | 6 ++ src/graph/node_operators_binary.h | 154 +++++++++++++++++++++++++++- src/models/transformer.h | 4 +- src/tensors/cpu/prod.cpp | 156 +++++++++++++++++++++++++++++ src/tensors/gpu/prod.cpp | 134 +++++++++++++++++++++++++ src/tensors/tensor_operators.h | 1 + 7 files changed, 455 insertions(+), 4 deletions(-) diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 6c7ef91ce..baec94dfd 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -519,6 +519,10 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) { return Expression(a, b, transA, transB, scale); } +Expr bdot_legacy(Expr a, Expr b, bool transA, bool transB, float scale) { + return Expression(a, b, transA, transB, scale); +} + Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // general version, MKL, CBlas or CUDA diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index f3d84eb6b..c1570effe 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -478,6 +478,12 @@ Expr bdot(Expr a, bool transB = false, float scalar = 1.f); +Expr bdot_legacy(Expr a, + Expr b, + bool transA = false, + bool transB = false, + float scalar = 1.f); + /** * Performs an affine transformation. 
* Computes diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index bd52103a9..169b1420b 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -546,8 +546,8 @@ class DotBatchedNodeOp : public NaryNodeOp { auto shapeOut = Shape::broadcast({shapeBatchA, shapeBatchB}); // set non-batch dimensions in output - shapeOut.set(-1, shapeA[-2]); - shapeOut.set(-2, shapeB[-1]); + shapeOut.set(-2, shapeA[-2]); + shapeOut.set(-1, shapeB[-1]); return shapeOut; } @@ -671,6 +671,156 @@ class DotBatchedNodeOp : public NaryNodeOp { const std::string color() override { return "orange"; } }; +class DotBatchedLegacyNodeOp : public NaryNodeOp { +private: + friend class SerializationHelpers; + bool transA_; + bool transB_; + float scalar_; + +public: + DotBatchedLegacyNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar) + : NaryNodeOp({a, b}, newShape(a, b, transA, transB)), + transA_(transA), + transB_(transB), + scalar_(scalar) {} + + Shape newShape(Expr a, Expr b, bool transA, bool transB) { + auto shapeA = a->shape(); + if(transA) { + shapeA.set(-2, a->shape()[-1]); + shapeA.set(-1, a->shape()[-2]); + } + + auto shapeB = b->shape(); + if(transB) { + shapeB.set(-2, b->shape()[-1]); + shapeB.set(-1, b->shape()[-2]); + } + + Shape outShape = shapeA; + outShape.set(-1, shapeB[-1]); + ABORT_IF(shapeA[-1] != shapeB[-2], + "Batched matrix product requires inner dimensions to match in {}{} * {}{}", std::string(shapeA), transA, std::string(shapeB), transB); + return outShape; + } + + NodeOps forwardOps() override { + // C = alpha * dot(op(A), op(B)) + return {NodeOp(ProdBatchedLegacy(val_, + graph()->allocator(), + child(0)->val(), + child(1)->val(), + transA_, + transB_, + 0.f, + scalar_))}; + } + + NodeOps backwardOps() override { + // D is the adjoint, the matrix of derivatives + // df/dA += alpha * dot(D, op(B).T) + // df/dB += alpha * dot(op(A).T, D) + // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C + // to sum gradients from different graph parts + + if(!transA_ && transB_) + return {NodeOp(ProdBatchedLegacy(child(0)->grad(), + graph()->allocator(), + adj_, + child(1)->val(), + false, + false, + 1.0, + scalar_)), + NodeOp(ProdBatchedLegacy(child(1)->grad(), + graph()->allocator(), + adj_, + child(0)->val(), + true, + false, + 1.0, + scalar_))}; + if(transA_ && !transB_) + return {NodeOp(ProdBatchedLegacy(child(0)->grad(), + graph()->allocator(), + child(1)->val(), + adj_, + false, + true, + 1.0, + scalar_)), + NodeOp(ProdBatchedLegacy(child(1)->grad(), + graph()->allocator(), + child(0)->val(), + adj_, + false, + false, + 1.0, + scalar_))}; + if(transA_ && transB_) + return {NodeOp(ProdBatchedLegacy(child(0)->grad(), + graph()->allocator(), + child(1)->val(), + adj_, + true, + true, + 1.0, + scalar_)), + NodeOp(ProdBatchedLegacy(child(1)->grad(), + graph()->allocator(), + adj_, + child(0)->val(), + true, + true, + 1.0, + scalar_))}; + return {NodeOp(ProdBatchedLegacy(child(0)->grad(), + graph()->allocator(), + adj_, + child(1)->val(), + false, + true, + 1.0, + scalar_)), + NodeOp(ProdBatchedLegacy(child(1)->grad(), + graph()->allocator(), + child(0)->val(), + adj_, + true, + false, + 1.0, + scalar_))}; + } + + const std::string type() override { return "bdot_legacy"; } + + virtual size_t hash() override { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, transA_); + util::hash_combine(seed, transB_); + util::hash_combine(seed, scalar_); + return seed; + } + + virtual bool equal(Expr node) override { + 
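+    // node equality must also compare the transpose flags and the scaling factor, so two
+    // bdot_legacy nodes that differ only in these options are never treated as identical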
if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(transA_ != cnode->transA_) + return false; + if(transB_ != cnode->transB_) + return false; + if(scalar_ != cnode->scalar_) + return false; + return true; + } + + const std::string color() override { return "orange"; } +}; + // Note: To reduce code duplication, we use the same NodeOp for C = op(S) x D and C = D x op(S). // Set swapOperands to select the latter. class CSRDotNodeOp : public NaryNodeOp { diff --git a/src/models/transformer.h b/src/models/transformer.h index 1da02318e..a792de8ba 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -249,7 +249,7 @@ class Transformer : public EncoderOrDecoderBase { // multiplicative attention with flattened softmax float scale = 1.0f / std::sqrt((float)dk); // scaling to avoid extreme values due to matrix multiplication - auto z = bdot(q, k, false, true, scale); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] + auto z = bdot_legacy(q, k, false, true, scale); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] // mask out garbage beyond end of sequences z = z + mask; @@ -264,7 +264,7 @@ class Transformer : public EncoderOrDecoderBase { weights = dropout(weights, inference_ ? 0 : opt("transformer-dropout-attention")); // apply attention weights to values - auto output = bdot(weights, v); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim] + auto output = bdot_legacy(weights, v); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim] return output; } diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 066867e4f..6e28158a6 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -237,6 +237,162 @@ void ProdBatched(marian::Tensor C, #endif } + +void ProdBatchedLegacy(marian::Tensor C, + Ptr /*allocator*/, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta, + float scalar) { +#if BLAS_FOUND + float alpha = scalar; + + // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements + // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... is a broadcastable shape as in element-wise kernels. 
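+  // Example: batch dims (2, 1) for A and (1, 3) for B broadcast to a (2, 3) batch of GEMMs;
+  // bindex() below maps each output batch index back to the matching, possibly repeated,
+  // sub-matrix of A and B.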
+ + auto aShape = A->shape(); + auto bShape = B->shape(); + + // make sure both shape have the same number of dimensions via broadcasting + size_t maxLength = std::max(aShape.size(), bShape.size()); + if(aShape.size() != bShape.size()) { + Shape ones(std::vector(maxLength, 1)); + aShape = Shape::broadcast({aShape, ones}); + bShape = Shape::broadcast({bShape, ones}); + } + + // Create meta-shapes without last 2 dimensions + Shape aShapeMeta, bShapeMeta, cShapeMeta; + aShapeMeta.resize(maxLength - 2); + bShapeMeta.resize(maxLength - 2); + for(size_t i = 0; i < maxLength - 2; ++i) { + aShapeMeta.set(i, aShape[i]); + bShapeMeta.set(i, bShape[i]); + } + cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); + + size_t m = aShape[-2]; + size_t k = aShape[-1]; + if(transA) + std::swap(m, k); + + size_t l = bShape[-2]; + size_t n = bShape[-1]; + if(transB) + std::swap(l, n); + + size_t lda = aShape[-1]; + size_t ldb = bShape[-1]; + size_t ldc = bShape[-1]; + + if(transB) + ldc = bShape[-2]; + + auto strideA = m * k; + auto strideB = n * k; + auto strideC = n * m; + + auto batchC = cShapeMeta.elements(); + + // Convert to functional shapes to be able to map dimensions. @TODO merge this + functional::Shape aShapeMetaF = aShapeMeta; + functional::Shape bShapeMetaF = bShapeMeta; + functional::Shape cShapeMetaF = cShapeMeta; + +#if MKL_FOUND + CBLAS_TRANSPOSE transA_forarr = CblasNoTrans; + CBLAS_TRANSPOSE transB_forarr = CblasNoTrans; + + if(transA) + transA_forarr = CblasTrans; + + if(transB) + transB_forarr = CblasTrans; + + /* cblas_sgemm_batch allows us to group all the small GEMMs that are done in a for loop with sgemm and compute + * them in only one MKL call. For the API documentation refer to + * https://software.intel.com/content/www/us/en/develop/documentation/mkl-developer-reference-c/top/blas-and-sparse-blas-routines/blas-like-extensions/cblas-gemm-batch.html + * The API supports dependencies, where you can specify one "group" of GEMMs to be computed after another. (This controlled by the group_count parameter). + * In our case, the operations are not dependent on one another so we hardcode one group. The rest of the arguments (with the exception of group_size) are + * the same as the ones that cblas_sgemm expects, with the difference that we are supposed to provide an array pointer (One element per group). 
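+   * With group_count == 1, each of these per-group arrays holds a single element and group_size[0] is the total number of GEMMs (batchC).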
+ * Weirdly enough, we are required to to provide all of the integer arguments as the MKL_INT datatype + */ + + static const constexpr size_t group_count = 1; // We have one group + const std::vector transa_arr(group_count, transA_forarr); + const std::vector transb_arr(group_count, transB_forarr); + const std::vector m_arr(group_count, (MKL_INT)m); + const std::vector n_arr(group_count, (MKL_INT)n); + const std::vector k_arr(group_count, (MKL_INT)k); + const std::vector alpha_arr(group_count, alpha); + const std::vector beta_arr(group_count, beta); + const std::vector lda_arr(group_count, (MKL_INT)lda); + const std::vector ldb_arr(group_count, (MKL_INT)ldb); + const std::vector ldc_arr(group_count, (MKL_INT)ldc); + const std::vector group_size(group_count, (MKL_INT)batchC); // Group size specifies number of GEMM operations per group (Which is batchC) + + std::vector a_array(batchC, nullptr); + std::vector b_array(batchC, nullptr); + std::vector c_array(batchC, nullptr); + + // This loop initializes the array pointers in the same way as the for loop + // in the normal sgemm version a few lines below + functional::Array dims; + for(size_t i = 0; i < batchC; ++i) { + cShapeMetaF.dims(i, dims); + auto aIndex = aShapeMetaF.bindex(dims); + auto bIndex = bShapeMetaF.bindex(dims); + + a_array[i] = A->data() + aIndex * strideA; + b_array[i] = B->data() + bIndex * strideB; + c_array[i] = C->data() + i * strideC; + } + cblas_sgemm_batch (CblasRowMajor, + &transa_arr[0], + &transb_arr[0], + &m_arr[0], + &n_arr[0], + &k_arr[0], + &alpha_arr[0], + &a_array[0], + &lda_arr[0], + &b_array[0], + &ldb_arr[0], + &beta_arr[0], + &c_array[0], + &ldc_arr[0], + group_count, + &group_size[0]); +#else + functional::Array dims; + for(size_t i = 0; i < batchC; ++i) { + cShapeMetaF.dims(i, dims); + auto aIndex = aShapeMetaF.bindex(dims); + auto bIndex = bShapeMetaF.bindex(dims); + + sgemm(transA, + transB, + (int)m, + (int)n, + (int)k, + alpha, + A->data() + aIndex * strideA, + (int)lda, + B->data() + bIndex * strideB, + (int)ldb, + beta, + C->data() + i * strideC, + (int)ldc); + } +#endif +#else + C; A; B; transA; transB; beta; scalar; + ABORT("You need to compile with MKL in order to use the CPU version"); +#endif +} + void ProdWithBias(marian::Tensor C, const marian::Tensor& A, const marian::Tensor& B, diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index 3e35237f0..7840b4aa3 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -468,6 +468,140 @@ void ProdBatched(marian::Tensor C, } } +template +void ProdBatchedTypedLegacy(marian::Tensor C, + Ptr allocator, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + ComputeType beta, + ComputeType scalar) { + CUDA_CHECK(cudaSetDevice((int)C->getDeviceId().no)); + ComputeType alpha = scalar; + + // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements + // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... is a broadcastable shape as in element-wise kernels. 
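+  // Further down, the per-batch A/B/C pointers are staged through the allocator and copied to
+  // the device, since the batched cuBLAS GEMM consumes arrays of device pointers (unlike the
+  // MKL path, which takes host pointer arrays).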
+ + auto aShape = A->shape(); + auto bShape = B->shape(); + + // make sure both shape have the same number of dimensions via broadcasting + size_t maxLength = std::max(aShape.size(), bShape.size()); + if(aShape.size() != bShape.size()) { + Shape ones(std::vector(maxLength, 1)); + aShape = Shape::broadcast({aShape, ones}); + bShape = Shape::broadcast({bShape, ones}); + } + + // Create meta-shapes without last 2 dimensions + Shape aShapeMeta, bShapeMeta, cShapeMeta; + aShapeMeta.resize(maxLength - 2); + bShapeMeta.resize(maxLength - 2); + for(size_t i = 0; i < maxLength - 2; ++i) { + aShapeMeta.set(i, aShape[i]); + bShapeMeta.set(i, bShape[i]); + } + cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); + + size_t m = aShape[-2]; + size_t k = aShape[-1]; + if(transA) + std::swap(m, k); + + size_t l = bShape[-2]; + size_t n = bShape[-1]; + if(transB) + std::swap(l, n); + + size_t lda = aShape[-1]; + size_t ldb = bShape[-1]; + size_t ldc = bShape[-1]; + + if(transB) + ldc = bShape[-2]; + + cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + auto backend = std::static_pointer_cast(C->getBackend()); + auto cublasHandle = backend->getCublasHandle(); + auto compute = backend->getCudaComputeCapability(); + + auto strideA = m * k; + auto strideB = n * k; + auto strideC = n * m; + + auto batchC = cShapeMeta.elements(); + + // Convert to functional shapes to be able to map dimensions. @TODO merge this + functional::Shape aShapeMetaF = aShapeMeta; + functional::Shape bShapeMetaF = bShapeMeta; + functional::Shape cShapeMetaF = cShapeMeta; + + std::vector aptr; + std::vector bptr; + std::vector cptr; + + functional::Array dims; + for(int i = 0; i < batchC; i++) { + cShapeMetaF.dims(i, dims); + auto aIndex = aShapeMetaF.bindex(dims); + auto bIndex = bShapeMetaF.bindex(dims); + + aptr.push_back(A->data() + aIndex * strideA); + bptr.push_back(B->data() + bIndex * strideB); + cptr.push_back(C->data() + i * strideC); + } + + // auto fails here from weird reason + IPtr mp_aptr = allocator->alloc(aptr.size()); + CudaCopy(aptr.data(), aptr.data() + aptr.size(), mp_aptr->data()); + + IPtr mp_bptr = allocator->alloc(bptr.size()); + CudaCopy(bptr.data(), bptr.data() + bptr.size(), mp_bptr->data()); + + IPtr mp_cptr = allocator->alloc(cptr.size()); + CudaCopy(cptr.data(), cptr.data() + cptr.size(), mp_cptr->data()); + + setTensorMode(cublasHandle); + TypedGemm::batchedGemm(cublasHandle, compute, + opB, opA, + n, m, k, + &alpha, + mp_bptr->data(), ldb, + mp_aptr->data(), lda, + &beta, + mp_cptr->data(), ldc, + batchC); + unsetTensorMode(cublasHandle); + + allocator->free(mp_aptr); + allocator->free(mp_bptr); + allocator->free(mp_cptr); +} + +// @TODO: add version with compute type for completeness +void ProdBatchedLegacy(marian::Tensor C, + Ptr allocator, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta, + float scalar) { + if(C->type() == Type::float32) { + ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); +#if COMPILE_FP16 + } else if(C->type() == Type::float16) { // not a *.cu file + ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); +#endif + } else { + ABORT("ProdBatchedLegacy not implemented for element type {}", C->type()); + } +} + + #if CUDA_VERSION >= 11000 // Earlier versions of cublasLT do not support bias addition for fp32 and fp16. 
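// The cublasLt affine helpers below are therefore only compiled when building against CUDA 11.0 or newer.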
static cublasStatus_t cublasLtAffineHelper(cublasLtHandle_t ltHandle, cublasOperation_t transA, cublasOperation_t transB, diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index ef4850683..6e587953c 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -104,6 +104,7 @@ DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bo DISPATCH8(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float, Type) // overloading since we want the default to for computeType be C->type() which difficult otherwise. DISPATCH8(ProdBatched, marian::Tensor, Ptr, const marian::Tensor, const marian::Tensor, bool, bool, float, float) +DISPATCH8(ProdBatchedLegacy, marian::Tensor, Ptr, const marian::Tensor, const marian::Tensor, bool, bool, float, float) DISPATCH9(CSRProd, marian::Tensor, Ptr, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float) DISPATCH10(Affine, marian::Tensor, Ptr, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float, bool) From 1d96d7b6eb84ab89893869d0f4bba429932ae3cd Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 7 Jun 2021 11:24:31 -0700 Subject: [PATCH 052/254] add legacy code on cpu --- src/tensors/cpu/prod.cpp | 71 ++++++++++------------------------------ 1 file changed, 17 insertions(+), 54 deletions(-) diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 6e28158a6..07cc2b99e 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -249,58 +249,31 @@ void ProdBatchedLegacy(marian::Tensor C, #if BLAS_FOUND float alpha = scalar; - // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements - // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... is a broadcastable shape as in element-wise kernels. - - auto aShape = A->shape(); - auto bShape = B->shape(); - - // make sure both shape have the same number of dimensions via broadcasting - size_t maxLength = std::max(aShape.size(), bShape.size()); - if(aShape.size() != bShape.size()) { - Shape ones(std::vector(maxLength, 1)); - aShape = Shape::broadcast({aShape, ones}); - bShape = Shape::broadcast({bShape, ones}); - } + size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); + size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]); - // Create meta-shapes without last 2 dimensions - Shape aShapeMeta, bShapeMeta, cShapeMeta; - aShapeMeta.resize(maxLength - 2); - bShapeMeta.resize(maxLength - 2); - for(size_t i = 0; i < maxLength - 2; ++i) { - aShapeMeta.set(i, aShape[i]); - bShapeMeta.set(i, bShape[i]); - } - cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); - - size_t m = aShape[-2]; - size_t k = aShape[-1]; + size_t m = A->shape()[-2]; + size_t k = A->shape()[-1]; if(transA) std::swap(m, k); - size_t l = bShape[-2]; - size_t n = bShape[-1]; + size_t l = B->shape()[-2]; + size_t n = B->shape()[-1]; if(transB) std::swap(l, n); - size_t lda = aShape[-1]; - size_t ldb = bShape[-1]; - size_t ldc = bShape[-1]; + size_t lda = A->shape()[-1]; + size_t ldb = B->shape()[-1]; + size_t ldc = B->shape()[-1]; if(transB) - ldc = bShape[-2]; + ldc = B->shape()[-2]; - auto strideA = m * k; - auto strideB = n * k; + auto strideB = batchB == 1 ? 0 : n * k; + auto strideA = batchA == 1 ? 
0 : m * k; auto strideC = n * m; - auto batchC = cShapeMeta.elements(); - - // Convert to functional shapes to be able to map dimensions. @TODO merge this - functional::Shape aShapeMetaF = aShapeMeta; - functional::Shape bShapeMetaF = bShapeMeta; - functional::Shape cShapeMetaF = cShapeMeta; - + auto batchC = std::max(batchA, batchB); #if MKL_FOUND CBLAS_TRANSPOSE transA_forarr = CblasNoTrans; CBLAS_TRANSPOSE transB_forarr = CblasNoTrans; @@ -339,14 +312,9 @@ void ProdBatchedLegacy(marian::Tensor C, // This loop initializes the array pointers in the same way as the for loop // in the normal sgemm version a few lines below - functional::Array dims; for(size_t i = 0; i < batchC; ++i) { - cShapeMetaF.dims(i, dims); - auto aIndex = aShapeMetaF.bindex(dims); - auto bIndex = bShapeMetaF.bindex(dims); - - a_array[i] = A->data() + aIndex * strideA; - b_array[i] = B->data() + bIndex * strideB; + a_array[i] = A->data() + (i % batchA) * strideA; + b_array[i] = B->data() + (i % batchB) * strideB; c_array[i] = C->data() + i * strideC; } cblas_sgemm_batch (CblasRowMajor, @@ -366,21 +334,16 @@ void ProdBatchedLegacy(marian::Tensor C, group_count, &group_size[0]); #else - functional::Array dims; for(size_t i = 0; i < batchC; ++i) { - cShapeMetaF.dims(i, dims); - auto aIndex = aShapeMetaF.bindex(dims); - auto bIndex = bShapeMetaF.bindex(dims); - sgemm(transA, transB, (int)m, (int)n, (int)k, alpha, - A->data() + aIndex * strideA, + A->data() + (i % batchA) * strideA, (int)lda, - B->data() + bIndex * strideB, + B->data() + (i % batchB) * strideB, (int)ldb, beta, C->data() + i * strideC, From ce34df4d985d3ff86e4babfc9529c4aaa0aba57d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 7 Jun 2021 11:25:40 -0700 Subject: [PATCH 053/254] add legacy code on gpu --- src/tensors/gpu/prod.cpp | 62 ++++++++++------------------------------ 1 file changed, 15 insertions(+), 47 deletions(-) diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index 7840b4aa3..e996f58f2 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -480,46 +480,25 @@ void ProdBatchedTypedLegacy(marian::Tensor C, CUDA_CHECK(cudaSetDevice((int)C->getDeviceId().no)); ComputeType alpha = scalar; - // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements - // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... is a broadcastable shape as in element-wise kernels. 
- - auto aShape = A->shape(); - auto bShape = B->shape(); - - // make sure both shape have the same number of dimensions via broadcasting - size_t maxLength = std::max(aShape.size(), bShape.size()); - if(aShape.size() != bShape.size()) { - Shape ones(std::vector(maxLength, 1)); - aShape = Shape::broadcast({aShape, ones}); - bShape = Shape::broadcast({bShape, ones}); - } - - // Create meta-shapes without last 2 dimensions - Shape aShapeMeta, bShapeMeta, cShapeMeta; - aShapeMeta.resize(maxLength - 2); - bShapeMeta.resize(maxLength - 2); - for(size_t i = 0; i < maxLength - 2; ++i) { - aShapeMeta.set(i, aShape[i]); - bShapeMeta.set(i, bShape[i]); - } - cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); + int batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); + int batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]); - size_t m = aShape[-2]; - size_t k = aShape[-1]; + int m = A->shape()[-2]; + int k = A->shape()[-1]; if(transA) std::swap(m, k); - size_t l = bShape[-2]; - size_t n = bShape[-1]; + int l = B->shape()[-2]; + int n = B->shape()[-1]; if(transB) std::swap(l, n); - size_t lda = aShape[-1]; - size_t ldb = bShape[-1]; - size_t ldc = bShape[-1]; + int lda = A->shape()[-1]; + int ldb = B->shape()[-1]; + int ldc = B->shape()[-1]; if(transB) - ldc = bShape[-2]; + ldc = B->shape()[-2]; cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -528,29 +507,18 @@ void ProdBatchedTypedLegacy(marian::Tensor C, auto cublasHandle = backend->getCublasHandle(); auto compute = backend->getCudaComputeCapability(); - auto strideA = m * k; - auto strideB = n * k; + auto strideA = batchA == 1 ? 0 : m * k; + auto strideB = batchB == 1 ? 0 : n * k; auto strideC = n * m; - - auto batchC = cShapeMeta.elements(); - - // Convert to functional shapes to be able to map dimensions. 
@TODO merge this - functional::Shape aShapeMetaF = aShapeMeta; - functional::Shape bShapeMetaF = bShapeMeta; - functional::Shape cShapeMetaF = cShapeMeta; + auto batchC = std::max(batchA, batchB); std::vector aptr; std::vector bptr; std::vector cptr; - functional::Array dims; for(int i = 0; i < batchC; i++) { - cShapeMetaF.dims(i, dims); - auto aIndex = aShapeMetaF.bindex(dims); - auto bIndex = bShapeMetaF.bindex(dims); - - aptr.push_back(A->data() + aIndex * strideA); - bptr.push_back(B->data() + bIndex * strideB); + aptr.push_back(A->data() + (i % batchA) * strideA); + bptr.push_back(B->data() + (i % batchB) * strideB); cptr.push_back(C->data() + i * strideC); } From 0949a4c914e718f83b4965f625b506ac75d1b0e0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 7 Jun 2021 15:05:56 -0700 Subject: [PATCH 054/254] start using bdot --- src/layers/output.cpp | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 9975416b0..871b0df2a 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -66,13 +66,17 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { //std::cerr << "x=" << x->shape() << std::endl; //std::cerr << "W=" << W->shape() << std::endl; //std::cerr << "transA=" << transA << " transB=" << transB << std::endl; - + /* Expr ret = x * W; ret = sum(ret, 3); //const Shape &retShape = ret->shape(); //std::cerr << "ret.1=" << retShape << std::endl; ret = transpose(ret, {0, 3, 2, 1}); - //ret = reshape(ret, {retShape[0], 1, 1, retShape[2]}); + */ + x = transpose(x, {0, 2, 1, 3}); + W = transpose(W, {0, 2, 1, 3}); + Expr ret = bdot(x, W, false, true); + //std::cerr << "ret.2=" << ret->shape() << std::endl; return ret; }; @@ -258,10 +262,10 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { const Shape &s = lemmaEt_->shape(); cachedShortLemmaEt = reshape(lemmaEt_, {1, s[0], 1, s[1]}); } - //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; - //std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; - Expr e = factorSoftmax * cachedShortLemmaEt; + std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; /* + Expr e = factorSoftmax * cachedShortLemmaEt; factorSoftmax= beam x 1 x batch x vocab cachedShortLemmaEt= 1 x 10 x 1 x vocab e= beam x 10 x batch x vocab @@ -270,11 +274,21 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; std::cerr << "e=" << e->shape() << std::endl; std::cerr << std::endl; - */ e = sum(e, 3); //std::cerr << "e.2=" << e->shape() << std::endl; e = transpose(e, {0, 3, 2, 1}); - //std::cerr << "e.3=" << e->shape() << std::endl; + */ + factorSoftmax = transpose(factorSoftmax, {0, 2, 1, 3}); + cachedShortLemmaEt = transpose(cachedShortLemmaEt, {0, 2, 1, 3}); + std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; + + Expr e = bdot(factorSoftmax, cachedShortLemmaEt, false, true); + std::cerr << "e.1=" << e->shape() << std::endl; + const Shape &eShape = e->shape(); + e = reshape(e, {eShape[0], 1, eShape[1], eShape[3]}); + std::cerr << "e.3=" << e->shape() << std::endl; + std::cerr << std::endl; // project it back to regular hidden dim int inputDim = input1->shape()[-1]; From acdff7768804bda9f5174e8f206b19f209152824 Mon Sep 17 
00:00:00 2001 From: Hieu Hoang Date: Mon, 7 Jun 2021 15:29:20 -0700 Subject: [PATCH 055/254] reduce tranform for no-shortlist --- src/layers/output.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 871b0df2a..1315cf316 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -75,9 +75,12 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { */ x = transpose(x, {0, 2, 1, 3}); W = transpose(W, {0, 2, 1, 3}); + //std::cerr << "x=" << x->shape() << std::endl; + //std::cerr << "W=" << W->shape() << std::endl; Expr ret = bdot(x, W, false, true); //std::cerr << "ret.2=" << ret->shape() << std::endl; + //std::cerr << std::endl; return ret; }; @@ -256,14 +259,17 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { #endif // re-embedding lookup, soft-indexed by softmax Expr cachedShortLemmaEt; - if(shortlist_) // short-listed version of re-embedding matrix + if(shortlist_) { // short-listed version of re-embedding matrix cachedShortLemmaEt = shortlist_->getCachedShortLemmaEt(); + cachedShortLemmaEt = transpose(cachedShortLemmaEt, {0, 2, 1, 3}); + } else { const Shape &s = lemmaEt_->shape(); - cachedShortLemmaEt = reshape(lemmaEt_, {1, s[0], 1, s[1]}); + std::cerr << "lemmaEt_=" << lemmaEt_->shape() << std::endl; + cachedShortLemmaEt = reshape(lemmaEt_, {1, 1, s[0], s[1]}); } std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; - std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; + std::cerr << "cachedShortLemmaEt.2=" << cachedShortLemmaEt->shape() << std::endl; /* Expr e = factorSoftmax * cachedShortLemmaEt; factorSoftmax= beam x 1 x batch x vocab @@ -279,9 +285,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { e = transpose(e, {0, 3, 2, 1}); */ factorSoftmax = transpose(factorSoftmax, {0, 2, 1, 3}); - cachedShortLemmaEt = transpose(cachedShortLemmaEt, {0, 2, 1, 3}); std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; - std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; Expr e = bdot(factorSoftmax, cachedShortLemmaEt, false, true); std::cerr << "e.1=" << e->shape() << std::endl; From b5f97dc6051164757863b925806cdd5ef356f7be Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 7 Jun 2021 15:35:18 -0700 Subject: [PATCH 056/254] reshape cachedShortLemmaEt --- src/data/shortlist.cpp | 2 +- src/layers/output.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 3e07983af..496b9ecbe 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -126,7 +126,7 @@ void Shortlist::broadcast(Expr weights, //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; - cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 0, 1, 3}); + cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 1, 0, 3}); //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 1315cf316..a2719dc24 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -261,7 +261,6 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { Expr cachedShortLemmaEt; if(shortlist_) { // short-listed version of 
re-embedding matrix cachedShortLemmaEt = shortlist_->getCachedShortLemmaEt(); - cachedShortLemmaEt = transpose(cachedShortLemmaEt, {0, 2, 1, 3}); } else { const Shape &s = lemmaEt_->shape(); From eb3f540d4260361968ce63b1fdc758121c618382 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 7 Jun 2021 15:37:22 -0700 Subject: [PATCH 057/254] debug --- src/layers/output.cpp | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index a2719dc24..9cdda4308 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -264,34 +264,20 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } else { const Shape &s = lemmaEt_->shape(); - std::cerr << "lemmaEt_=" << lemmaEt_->shape() << std::endl; + //std::cerr << "lemmaEt_=" << lemmaEt_->shape() << std::endl; cachedShortLemmaEt = reshape(lemmaEt_, {1, 1, s[0], s[1]}); } - std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; - std::cerr << "cachedShortLemmaEt.2=" << cachedShortLemmaEt->shape() << std::endl; - /* - Expr e = factorSoftmax * cachedShortLemmaEt; - factorSoftmax= beam x 1 x batch x vocab - cachedShortLemmaEt= 1 x 10 x 1 x vocab - e= beam x 10 x batch x vocab - - std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; - std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; - std::cerr << "e=" << e->shape() << std::endl; - std::cerr << std::endl; - e = sum(e, 3); - //std::cerr << "e.2=" << e->shape() << std::endl; - e = transpose(e, {0, 3, 2, 1}); - */ + //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt.2=" << cachedShortLemmaEt->shape() << std::endl; factorSoftmax = transpose(factorSoftmax, {0, 2, 1, 3}); - std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; Expr e = bdot(factorSoftmax, cachedShortLemmaEt, false, true); - std::cerr << "e.1=" << e->shape() << std::endl; + //std::cerr << "e.1=" << e->shape() << std::endl; const Shape &eShape = e->shape(); e = reshape(e, {eShape[0], 1, eShape[1], eShape[3]}); - std::cerr << "e.3=" << e->shape() << std::endl; - std::cerr << std::endl; + //std::cerr << "e.3=" << e->shape() << std::endl; + //std::cerr << std::endl; // project it back to regular hidden dim int inputDim = input1->shape()[-1]; From 92c6c077868a48f45deeaf901c74f633e27319d0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 7 Jun 2021 15:43:54 -0700 Subject: [PATCH 058/254] reshape cachedShortWt_ --- src/data/shortlist.cpp | 2 +- src/layers/output.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 496b9ecbe..7db84cb93 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -113,7 +113,7 @@ void Shortlist::broadcast(Expr weights, //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); //std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; - cachedShortWt_ = transpose(cachedShortWt_, {1, 2, 0, 3}); + cachedShortWt_ = transpose(cachedShortWt_, {1, 0, 2, 3}); //std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; if (b) { diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 9cdda4308..4f413272c 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -74,7 +74,6 @@ 
Logits Output::applyAsLogits(Expr input) /*override final*/ { ret = transpose(ret, {0, 3, 2, 1}); */ x = transpose(x, {0, 2, 1, 3}); - W = transpose(W, {0, 2, 1, 3}); //std::cerr << "x=" << x->shape() << std::endl; //std::cerr << "W=" << W->shape() << std::endl; Expr ret = bdot(x, W, false, true); From e07e0368c9aa3304f71c5d169348debc9a3c9115 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 8 Jun 2021 18:20:56 -0700 Subject: [PATCH 059/254] debug --- src/layers/output.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 4f413272c..947b5ff80 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -63,23 +63,15 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { }; auto affineShortlist = [](Expr x, Expr W, Expr b, bool , bool ) { - //std::cerr << "x=" << x->shape() << std::endl; - //std::cerr << "W=" << W->shape() << std::endl; - //std::cerr << "transA=" << transA << " transB=" << transB << std::endl; - /* - Expr ret = x * W; - ret = sum(ret, 3); - //const Shape &retShape = ret->shape(); - //std::cerr << "ret.1=" << retShape << std::endl; - ret = transpose(ret, {0, 3, 2, 1}); - */ + std::cerr << "x=" << x->shape() << std::endl; + std::cerr << "W=" << W->shape() << std::endl; x = transpose(x, {0, 2, 1, 3}); - //std::cerr << "x=" << x->shape() << std::endl; - //std::cerr << "W=" << W->shape() << std::endl; + std::cerr << "x=" << x->shape() << std::endl; + std::cerr << "W=" << W->shape() << std::endl; Expr ret = bdot(x, W, false, true); - //std::cerr << "ret.2=" << ret->shape() << std::endl; - //std::cerr << std::endl; + std::cerr << "ret.2=" << ret->shape() << std::endl; + std::cerr << std::endl; return ret; }; @@ -182,20 +174,28 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // matrix Expr factorLogits; if(g == 0 && shortlist_) { + //std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; + //std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineShortlist( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits + //std::cerr << "affineShortlist.factorLogits.1=" << factorLogits->shape() << std::endl; + factorLogits = transpose(factorLogits, {0, 2, 1, 3}); + //std::cerr << "affineShortlist.factorLogits.2=" << factorLogits->shape() << std::endl; } else { + //std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; + //std::cerr << "affineOrDot.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineOrDot( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits + //std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl; } // optionally add lemma-dependent bias @@ -270,6 +270,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { //std::cerr << "cachedShortLemmaEt.2=" << cachedShortLemmaEt->shape() << std::endl; factorSoftmax = transpose(factorSoftmax, {0, 2, 1, 3}); //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt.2=" << cachedShortLemmaEt->shape() << std::endl; Expr e = bdot(factorSoftmax, cachedShortLemmaEt, false, true); //std::cerr << "e.1=" << e->shape() << std::endl; From 5d1946ebd3e959c801c960637d3adeac3d838ed5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 20:07:57 +0000 Subject: [PATCH 060/254] filter & broadcast every word. 
SL works --- src/data/shortlist.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 7db84cb93..c6526430d 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -35,9 +35,9 @@ WordIndex Shortlist::tryForwardMap(int , int , WordIndex wIdx) const { } void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - if (done_) { - return; - } + //if (done_) { + // return; + //} //if (indicesExpr_) return; int currBeamSize = input->shape()[0]; @@ -109,12 +109,14 @@ void Shortlist::broadcast(Expr weights, indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; + std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; + std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); - //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; + std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); - //std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; + std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = transpose(cachedShortWt_, {1, 0, 2, 3}); - //std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; + std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; if (b) { ABORT("Bias not yet tested"); From 0bc9b22b151d339e21f5ed580a4c619642632e9b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 20:18:19 +0000 Subject: [PATCH 061/254] separate broadcast --- src/data/shortlist.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ src/data/shortlist.h | 7 +++++++ 2 files changed, 48 insertions(+) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index c6526430d..3d479e9de 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -238,6 +238,47 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, #endif } +void LSHShortlist::broadcast(Expr weights, + bool isLegacyUntransposedW, + Expr b, + Expr lemmaEt, + Expr indicesExprBC, + int k) { + //std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; + int batchSize = indicesExprBC->shape()[0]; + int currBeamSize = indicesExprBC->shape()[1]; + //int numHypos = batchSize * currBeamSize; + //std::cerr << "batchSize=" << batchSize << std::endl; + //std::cerr << "currBeamSize=" << currBeamSize << std::endl; + //std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; + ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); + + indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); + //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; + + std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; + std::cerr << "weights=" << weights->shape() << std::endl; + cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExprBC); + std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; + cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); + std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; + cachedShortWt_ = transpose(cachedShortWt_, {1, 0, 2, 3}); + std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; + + if (b) { + ABORT("Bias not yet tested"); + cachedShortb_ = index_select(b, -1, indicesExprBC); + cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested + } + + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); + //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); + //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 1, 0, 3}); + //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; +} + LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits) : k_(k), nbits_(nbits) { //std::cerr << "LSHShortlistGenerator" << std::endl; diff --git a/src/data/shortlist.h b/src/data/shortlist.h index e353cdd9e..d003a68cd 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -74,6 +74,13 @@ class LSHShortlist: public Shortlist { static Ptr index_; + virtual void broadcast(Expr weights, + bool isLegacyUntransposedW, + Expr b, + Expr lemmaEt, + Expr indicesExprBC, + int k) override; + public: LSHShortlist(int k, int nbits); virtual WordIndex reverseMap(int batchIdx, int beamIdx, int idx) const override; From 79dbde7efcba69c57a2cd4a1cf3bbd92bf199015 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 20:45:40 +0000 Subject: [PATCH 062/254] don't manually broadcast weights --- src/data/shortlist.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 3d479e9de..9c3a01028 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -54,6 +54,8 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); Expr indicesExprBC = getIndicesExpr(batchSize, currBeamSize); + std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; + std::cerr << "indicesExprBC=" << indicesExprBC->shape() << std::endl; broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExprBC, k); done_ = true; } @@ -111,17 +113,14 @@ void Shortlist::broadcast(Expr weights, std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; std::cerr << "weights=" << weights->shape() << std::endl; - cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); + cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExpr_); std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; - cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); - std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; - cachedShortWt_ = transpose(cachedShortWt_, {1, 0, 2, 3}); - std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; + cachedShortWt_ = reshape(cachedShortWt_, {1, 1, cachedShortWt_->shape()[0], cachedShortWt_->shape()[1]}); if (b) { ABORT("Bias not yet tested"); - cachedShortb_ = index_select(b, -1, indicesExprBC); - cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested + cachedShortb_ = index_select(b, -1, indicesExpr_); + cachedShortb_ = reshape(cachedShortb_, {1, k, 1, cachedShortb_->shape()[1]}); // not tested } cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); From 4b9082bc395cc090cdd73c4d504db45fe1fb025c Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 20:57:02 +0000 Subject: [PATCH 063/254] don't manually broadcast lemma --- src/data/shortlist.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 9c3a01028..0d6297089 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -114,7 +114,7 @@ void Shortlist::broadcast(Expr weights, std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExpr_); - std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; + //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {1, 1, cachedShortWt_->shape()[0], cachedShortWt_->shape()[1]}); if (b) { @@ -123,12 +123,11 @@ void Shortlist::broadcast(Expr weights, cachedShortb_ = reshape(cachedShortb_, {1, k, 1, cachedShortb_->shape()[1]}); // not tested } - cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); - //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); - //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; - cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 1, 0, 3}); - //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; + std::cerr << "lemmaEt.1_=" << lemmaEt->shape() << std::endl; + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExpr_); + std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {1, 1, cachedShortLemmaEt_->shape()[0], k}); + std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; } /////////////////////////////////////////////////////////////////////////////////// From 1e3db86a94481c7dfd1a90355c8b9af145ef9ad6 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 21:56:51 +0000 Subject: [PATCH 064/254] batch based filtering. 
COmment out debug --- src/data/shortlist.cpp | 38 ++++++++++---------------------------- src/data/shortlist.h | 7 +++---- src/layers/output.cpp | 12 ++++++------ 3 files changed, 19 insertions(+), 38 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 0d6297089..599a3ffab 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -35,15 +35,9 @@ WordIndex Shortlist::tryForwardMap(int , int , WordIndex wIdx) const { } void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - //if (done_) { - // return; - //} - - //if (indicesExpr_) return; - int currBeamSize = input->shape()[0]; - int batchSize = input->shape()[2]; - //std::cerr << "currBeamSize=" << currBeamSize << std::endl; - //std::cerr << "batchSize=" << batchSize << std::endl; + if (done_) { + return; + } auto forward = [this](Expr out, const std::vector& ) { out->val()->set(indices_); @@ -53,10 +47,8 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp Shape kShape({k}); indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); - Expr indicesExprBC = getIndicesExpr(batchSize, currBeamSize); - std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; - std::cerr << "indicesExprBC=" << indicesExprBC->shape() << std::endl; - broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExprBC, k); + //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; + broadcast(weights, isLegacyUntransposedW, b, lemmaEt, k); done_ = true; } @@ -97,22 +89,12 @@ void Shortlist::broadcast(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, - Expr indicesExprBC, int k) { - //std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; - int batchSize = indicesExprBC->shape()[0]; - int currBeamSize = indicesExprBC->shape()[1]; - //int numHypos = batchSize * currBeamSize; - //std::cerr << "batchSize=" << batchSize << std::endl; - //std::cerr << "currBeamSize=" << currBeamSize << std::endl; //std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); - indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); - //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; - - std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; - std::cerr << "weights=" << weights->shape() << std::endl; + //std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; + //std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExpr_); //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {1, 1, cachedShortWt_->shape()[0], cachedShortWt_->shape()[1]}); @@ -123,11 +105,11 @@ void Shortlist::broadcast(Expr weights, cachedShortb_ = reshape(cachedShortb_, {1, k, 1, cachedShortb_->shape()[1]}); // not tested } - std::cerr << "lemmaEt.1_=" << lemmaEt->shape() << std::endl; + //std::cerr << "lemmaEt.1_=" << lemmaEt->shape() << std::endl; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExpr_); - std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {1, 1, cachedShortLemmaEt_->shape()[0], k}); - std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; + //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; } /////////////////////////////////////////////////////////////////////////////////// diff --git a/src/data/shortlist.h b/src/data/shortlist.h index d003a68cd..1c1243abe 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -31,11 +31,10 @@ class Shortlist { Expr cachedShortLemmaEt_; bool done_; - virtual void broadcast(Expr weights, + void broadcast(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, - Expr indicesExprBC, int k); public: static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos @@ -74,12 +73,12 @@ class LSHShortlist: public Shortlist { static Ptr index_; - virtual void broadcast(Expr weights, + void broadcast(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, Expr indicesExprBC, - int k) override; + int k); public: LSHShortlist(int k, int nbits); diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 947b5ff80..21789bbe7 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -63,15 +63,15 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { }; auto affineShortlist = [](Expr x, Expr W, Expr b, bool , bool ) { - std::cerr << "x=" << x->shape() << std::endl; - std::cerr << "W=" << W->shape() << std::endl; + //std::cerr << "x=" << x->shape() << std::endl; + //std::cerr << "W=" << W->shape() << std::endl; x = transpose(x, {0, 2, 1, 3}); - std::cerr << "x=" << x->shape() << std::endl; - std::cerr << "W=" << W->shape() << std::endl; + //std::cerr << "x=" << x->shape() << std::endl; + //std::cerr << "W=" << W->shape() << std::endl; Expr ret = bdot(x, W, false, true); - std::cerr << "ret.2=" << ret->shape() << std::endl; - std::cerr << std::endl; + //std::cerr << "ret.2=" << ret->shape() << std::endl; + //std::cerr << std::endl; return ret; }; From 6f0f534a4aafff35c8922136dc8ec6658407c1fe Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 22:36:34 +0000 Subject: [PATCH 065/254] debug --- src/data/shortlist.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 599a3ffab..064e8fa77 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -129,13 +129,14 @@ LSHShortlist::LSHShortlist(int k, int nbits) //#define BLAS_FOUND 1 WordIndex LSHShortlist::reverseMap(int batchIdx, int beamIdx, int idx) const { - //std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; - //std::cerr << "indicesExpr_=" << 
indicesExpr_->shape() << std::endl; + std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; + std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; + int currBatchSize = indicesExpr_->shape()[0]; int currBeamSize = indicesExpr_->shape()[1]; - //std::cerr << "currBeamSize=" << currBeamSize << std::endl; - //std::cerr << "indices_=" << indices_.size() << std::endl; + std::cerr << "currBatchSize=" << currBatchSize << " currBeamSize=" << currBeamSize << std::endl; + std::cerr << "indices_=" << indices_.size() << std::endl; idx = (k_ * currBeamSize) * batchIdx + k_ * beamIdx + idx; - //std::cerr << "idx=" << idx << std::endl; + std::cerr << "idx=" << idx << std::endl; assert(idx < indices_.size()); return indices_[idx]; } @@ -236,14 +237,14 @@ void LSHShortlist::broadcast(Expr weights, indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; - std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; - std::cerr << "weights=" << weights->shape() << std::endl; + //std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; + //std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); - std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; + //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); - std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; + //std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = transpose(cachedShortWt_, {1, 0, 2, 3}); - std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; + //std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; if (b) { ABORT("Bias not yet tested"); From fe97259d3d2f88beb27668b249beae474d4948bf Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 22:58:25 +0000 Subject: [PATCH 066/254] debug --- src/common/utils.h | 16 ++++++++++++++++ src/data/shortlist.cpp | 2 ++ 2 files changed, 18 insertions(+) diff --git a/src/common/utils.h b/src/common/utils.h index 72994ccfe..d8d387a82 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -2,6 +2,7 @@ #include #include +#include namespace marian { namespace utils { @@ -62,5 +63,20 @@ std::string findReplace(const std::string& in, const std::string& what, const st double parseDouble(std::string s); double parseNumber(std::string s); + +template +void Debug(const T *arr, size_t size, const std::string &str) { + std::cerr << str << ":" << size << ": "; + for (size_t i = 0; i < size; ++i) { + std::cerr << arr[i] << " "; + } + std::cerr << std::endl; +} + +template +void Debug(const std::vector &arr, const std::string &str) { + Debug(arr.data(), arr.size(), str); +} + } // namespace utils } // namespace marian diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 064e8fa77..de189e5d2 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -161,6 +161,8 @@ Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { #if BLAS_FOUND + static int c = 0; + std::cerr << "c=" << c++ << std::endl; 
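+  // the counter above only tracks how many times the LSH shortlist filter has been entered (debugging aid)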
ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, "LSH index (--output-approx-knn) currently not implemented for GPU"); From 5a93c6718568a59cc0119663fa2093e69a436d56 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 9 Jun 2021 23:31:06 +0000 Subject: [PATCH 067/254] origBatchIdx -> currentBatchIdx. Doesn't crash but bad results --- src/translator/beam_search.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index eb3ecab80..cefa99376 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -4,6 +4,7 @@ #include "translator/helpers.h" #include "translator/nth_element.h" #include "data/shortlist.h" +#include "common/utils.h" namespace marian { @@ -19,6 +20,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current const std::vector& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use. const std::vector& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx] std::vector align; // collects alignment information from the last executed time step + //utils::Debug(batchIdxMap, "batchIdxMap"); if(options_->hasAndNotEmpty("alignment") && factorGroup == 0) align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble, @@ -99,7 +101,8 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) origBatchIdx, (int) prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + std::cerr << "currentBatchId=" << currentBatchIdx << " origBatchIdx=" << origBatchIdx << std::endl; + word = factoredVocab->lemma2Word(shortlist ? 
shortlist->reverseMap((int) currentBatchIdx, (int) prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), From fef7202bc809ac1dd31a89a7f6697e110b681d95 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 10 Jun 2021 23:58:25 -0700 Subject: [PATCH 068/254] batch-beam -> beam-batch --- src/data/shortlist.cpp | 49 ++++++++++++++++++++++++------------------ src/layers/logits.cpp | 31 ++++++++++++++++++++------ src/layers/logits.h | 11 ++++------ src/layers/output.cpp | 26 ++++++++++++---------- 4 files changed, 72 insertions(+), 45 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index de189e5d2..305bc9282 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -129,13 +129,18 @@ LSHShortlist::LSHShortlist(int k, int nbits) //#define BLAS_FOUND 1 WordIndex LSHShortlist::reverseMap(int batchIdx, int beamIdx, int idx) const { - std::cerr << "\nbatchIdx=" << batchIdx << " beamIdx=" << beamIdx << " idx=" << idx << std::endl; + std::cerr << "\nbatchIdx=" << batchIdx + << " beamIdx=" << beamIdx + << " idx=" << idx + << " k_=" << k_ + << std::endl; std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; - int currBatchSize = indicesExpr_->shape()[0]; - int currBeamSize = indicesExpr_->shape()[1]; + int currBeamSize = indicesExpr_->shape()[0]; + int currBatchSize = indicesExpr_->shape()[1]; std::cerr << "currBatchSize=" << currBatchSize << " currBeamSize=" << currBeamSize << std::endl; std::cerr << "indices_=" << indices_.size() << std::endl; - idx = (k_ * currBeamSize) * batchIdx + k_ * beamIdx + idx; + idx = (k_ * currBatchSize * beamIdx) + (k_ * batchIdx) + idx; + //idx = (k_ * currBeamSize * batchIdx) + (k_ * beamIdx) + idx; std::cerr << "idx=" << idx << std::endl; assert(idx < indices_.size()); return indices_[idx]; @@ -152,13 +157,16 @@ WordIndex LSHShortlist::tryForwardMap(int , int , WordIndex wIdx) const { } Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { - //std::cerr << "batchSize=" << batchSize << " currBeamSize=" << currBeamSize << std::endl; - //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << " " << indicesExpr_->val() << std::endl; - assert(indicesExpr_->shape()[0] == batchSize); - assert(indicesExpr_->shape()[1] == currBeamSize); - return indicesExpr_; + std::cerr << "batchSize=" << batchSize << " currBeamSize=" << currBeamSize << std::endl; + std::cerr << "indicesExpr_=" << indicesExpr_->shape() << " " << indicesExpr_->val() << std::endl; + assert(indicesExpr_->shape()[0] == currBeamSize); + assert(indicesExpr_->shape()[1] == batchSize); + Expr ret = transpose(indicesExpr_, {1, 0, 2}); + return ret; } +#define BLAS_FOUND 1 + void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { #if BLAS_FOUND static int c = 0; @@ -186,6 +194,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, index_->add( vRows, values->val()->data()); } + std::cerr << "query=" << query->shape() << std::endl; int qRows = query->shape().elements() / dim; std::vector distances(qRows * k_); std::vector ids(qRows * k_); @@ -207,8 +216,8 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, out->val()->set(indices_); }; - Shape kShape({batchSize, currBeamSize, k_}); - //std::cerr << "kShape=" << 
kShape << std::endl; + Shape kShape({currBeamSize, batchSize, k_}); + std::cerr << "kShape=" << kShape << std::endl; indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; @@ -227,9 +236,9 @@ void LSHShortlist::broadcast(Expr weights, Expr lemmaEt, Expr indicesExprBC, int k) { - //std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; - int batchSize = indicesExprBC->shape()[0]; - int currBeamSize = indicesExprBC->shape()[1]; + std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; + int currBeamSize = indicesExprBC->shape()[0]; + int batchSize = indicesExprBC->shape()[1]; //int numHypos = batchSize * currBeamSize; //std::cerr << "batchSize=" << batchSize << std::endl; //std::cerr << "currBeamSize=" << currBeamSize << std::endl; @@ -239,14 +248,12 @@ void LSHShortlist::broadcast(Expr weights, indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; - //std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; - //std::cerr << "weights=" << weights->shape() << std::endl; + std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; + std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); - //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; - cachedShortWt_ = reshape(cachedShortWt_, {batchSize, currBeamSize, k, cachedShortWt_->shape()[1]}); - //std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; - cachedShortWt_ = transpose(cachedShortWt_, {1, 0, 2, 3}); - //std::cerr << "cachedShortWt_.3=" << cachedShortWt_->shape() << std::endl; + std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; + cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, k, cachedShortWt_->shape()[1]}); + std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; if (b) { ABORT("Bias not yet tested"); diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index d25b20460..9eb4c2903 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -8,6 +8,15 @@ Logits::Logits(Expr logits) : Logits(New(logits, nullptr)) { } // single-output constructor from Expr only (RationalLoss has no count) +Logits::Logits(Ptr logits) { // single-output constructor + logits_.push_back(logits); +} + +Logits::Logits(std::vector>&& logits, + Ptr embeddingFactorMapping) // factored-output constructor + : logits_(std::move(logits)), factoredVocab_(embeddingFactorMapping) { +} + Ptr Logits::graph() const { ABORT_IF(logits_.empty(), "Empty logits object??"); return logits_.front()->loss()->graph(); @@ -53,6 +62,7 @@ Expr Logits::applyLossFunction( auto factorIndices = indices(maskedFactoredLabels.indices); // [B... flattened] factor-label indices, or 0 if factor does not apply auto factorMask = constant(maskedFactoredLabels.masks); // [B... flattened] loss values get multiplied with 0 for labels that don't have this factor auto factorLogits = logits_[g]; // [B... * Ug] label-wise loss values (not aggregated yet) + std::cerr << "g=" << g << " factorLogits->loss()=" << factorLogits->loss()->shape() << std::endl; // For each location in [B...] select [indices[B...]]. If not using factor, select [0] and mask it out next. auto factorLoss = lossFn(factorLogits->loss(), factorIndices); // [B... 
x 1] // clang-format on @@ -85,12 +95,14 @@ Expr Logits::getFactoredLogits(size_t groupIndex, ABORT_IF(empty(), "Attempted to read out logits on empty Logits object"); auto sel = logits_[groupIndex]->loss(); // [localBeamSize, 1, dimBatch, dimFactorVocab] + std::cerr << "sel.1=" << sel->shape() << std::endl; // normalize for decoding: // - all secondary factors: subtract their max // - lemma: add all maxes of applicable factors if(groupIndex > 0) { sel = sel - max(sel, -1); + std::cerr << "sel.2=" << sel->shape() << std::endl; } else { auto numGroups = getNumFactorGroups(); for(size_t g = 1; g < numGroups; g++) { @@ -101,7 +113,7 @@ Expr Logits::getFactoredLogits(size_t groupIndex, factorMasks = constant(getFactorMasks(g, std::vector())); } else { - //std::cerr << "sel=" << sel->shape() << std::endl; + std::cerr << "sel.3=" << sel->shape() << std::endl; auto forward = [this, g](Expr out, const std::vector& inputs) { Expr lastIndices = inputs[0]; std::vector masks = getFactorMasksMultiDim(g, lastIndices); @@ -111,20 +123,27 @@ Expr Logits::getFactoredLogits(size_t groupIndex, int currBeamSize = sel->shape()[0]; int batchSize = sel->shape()[2]; Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); - //std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; + std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); - //std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; + std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; factorMasks = transpose(factorMasks, {1, 0, 2}); - //std::cerr << "factorMasks.2=" << factorMasks->shape() << std::endl; + std::cerr << "factorMasks.2=" << factorMasks->shape() << std::endl; const Shape &s = factorMasks->shape(); factorMasks = reshape(factorMasks, {s[0], 1, s[1], s[2]}); - //std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; + std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; } factorMaxima = cast(factorMaxima, sel->value_type()); + std::cerr << "factorMaxima=" << factorMaxima->shape() << std::endl; factorMasks = cast(factorMasks, sel->value_type()); - sel = sel + factorMaxima * factorMasks; // those lemmas that don't have a factor + std::cerr << "factorMasks.4=" << factorMasks->shape() << std::endl; + + Expr tmp = factorMaxima * factorMasks; + std::cerr << "tmp=" << tmp->shape() << std::endl; + std::cerr << "sel.4=" << sel->shape() << std::endl; + sel = sel + tmp; // those lemmas that don't have a factor // get multiplied with 0 + std::cerr << "sel.5=" << sel->shape() << std::endl; } } diff --git a/src/layers/logits.h b/src/layers/logits.h index 21d72d2a8..1a57657d6 100644 --- a/src/layers/logits.h +++ b/src/layers/logits.h @@ -17,14 +17,11 @@ class RationalLoss; class Logits { public: Logits() {} - explicit Logits(Ptr logits) { // single-output constructor - logits_.push_back(logits); - } - explicit Logits( - Expr logits); // single-output constructor from Expr only (RationalLoss has no count) + explicit Logits(Ptr logits); // single-output constructor + explicit Logits(Expr logits); // single-output constructor from Expr only (RationalLoss has no count) Logits(std::vector>&& logits, - Ptr embeddingFactorMapping) // factored-output constructor - : logits_(std::move(logits)), factoredVocab_(embeddingFactorMapping) {} + Ptr embeddingFactorMapping); // factored-output constructor + Expr getLogits() const; // assume it holds logits: get them, possibly aggregating over factors 
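// Illustrative arithmetic for the beam-batch reordering in this commit: indices_ now stores the
// row-major flattening of a [currBeamSize, currBatchSize, k_] tensor, so the entry for
// (beamIdx, batchIdx, i) sits at offset (beamIdx * batchSize + batchIdx) * k + i, which is what
// LSHShortlist::reverseMap computes. flatOffset below is a hypothetical standalone helper, not a
// function from the codebase.
#include <cstddef>

static size_t flatOffset(size_t beamIdx, size_t batchIdx, size_t i, size_t batchSize, size_t k) {
  return (beamIdx * batchSize + batchIdx) * k + i;
}
// Example: with batchSize = 2 and k = 3, flatOffset(1, 0, 2, 2, 3) == (1*2 + 0)*3 + 2 == 8.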
Expr getFactoredLogits( size_t groupIndex, diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 21789bbe7..2d5df585b 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -63,9 +63,6 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { }; auto affineShortlist = [](Expr x, Expr W, Expr b, bool , bool ) { - //std::cerr << "x=" << x->shape() << std::endl; - //std::cerr << "W=" << W->shape() << std::endl; - x = transpose(x, {0, 2, 1, 3}); //std::cerr << "x=" << x->shape() << std::endl; //std::cerr << "W=" << W->shape() << std::endl; Expr ret = bdot(x, W, false, true); @@ -174,29 +171,35 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // matrix Expr factorLogits; if(g == 0 && shortlist_) { - //std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; - //std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; + std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; + std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; + Expr tmp = transpose(input1, {0, 2, 1, 3}); + //std::cerr << "x=" << x->shape() << std::endl; + //std::cerr << "W=" << W->shape() << std::endl; factorLogits = affineShortlist( - input1, + tmp, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits - //std::cerr << "affineShortlist.factorLogits.1=" << factorLogits->shape() << std::endl; + std::cerr << "affineShortlist.factorLogits.1=" << factorLogits->shape() << std::endl; factorLogits = transpose(factorLogits, {0, 2, 1, 3}); - //std::cerr << "affineShortlist.factorLogits.2=" << factorLogits->shape() << std::endl; + std::cerr << "affineShortlist.factorLogits.2=" << factorLogits->shape() << std::endl; } else { - //std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; - //std::cerr << "affineOrDot.factorWt=" << factorWt->shape() << std::endl; + std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; + std::cerr << "affineOrDot.factorWt.1=" << factorWt->shape() << std::endl; + //factorWt = transpose(factorWt, {1, 0, 2, 3}); + //std::cerr << "affineOrDot.factorWt.2=" << factorWt->shape() << std::endl; factorLogits = affineOrDot( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits - //std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl; + std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl; } + std::cerr << std::endl; // optionally add lemma-dependent bias if(Plemma) { // [B... x U0] @@ -210,6 +213,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { auto b = dot(Plemma, lemmaBt, false, true); // [B... 
x U] factorLogits = factorLogits + b; } + //std::cerr << "factorLogits=" << factorLogits->shape() << std::endl; allLogits[g] = New(factorLogits, nullptr); // optionally add a soft embedding of lemma back to create some lemma dependency // @TODO: if this works, move it into lazyConstruct From f0251889f2a22cfb641fa2a0b287df2464eff3b8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Jun 2021 00:57:02 -0700 Subject: [PATCH 069/254] debug --- src/data/shortlist.cpp | 26 +------------------------- src/layers/logits.cpp | 12 ------------ src/layers/output.cpp | 8 -------- src/translator/beam_search.cpp | 1 - 4 files changed, 1 insertion(+), 46 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 305bc9282..7fba4b676 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -129,19 +129,9 @@ LSHShortlist::LSHShortlist(int k, int nbits) //#define BLAS_FOUND 1 WordIndex LSHShortlist::reverseMap(int batchIdx, int beamIdx, int idx) const { - std::cerr << "\nbatchIdx=" << batchIdx - << " beamIdx=" << beamIdx - << " idx=" << idx - << " k_=" << k_ - << std::endl; - std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; - int currBeamSize = indicesExpr_->shape()[0]; + //int currBeamSize = indicesExpr_->shape()[0]; int currBatchSize = indicesExpr_->shape()[1]; - std::cerr << "currBatchSize=" << currBatchSize << " currBeamSize=" << currBeamSize << std::endl; - std::cerr << "indices_=" << indices_.size() << std::endl; idx = (k_ * currBatchSize * beamIdx) + (k_ * batchIdx) + idx; - //idx = (k_ * currBeamSize * batchIdx) + (k_ * beamIdx) + idx; - std::cerr << "idx=" << idx << std::endl; assert(idx < indices_.size()); return indices_[idx]; } @@ -157,8 +147,6 @@ WordIndex LSHShortlist::tryForwardMap(int , int , WordIndex wIdx) const { } Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { - std::cerr << "batchSize=" << batchSize << " currBeamSize=" << currBeamSize << std::endl; - std::cerr << "indicesExpr_=" << indicesExpr_->shape() << " " << indicesExpr_->val() << std::endl; assert(indicesExpr_->shape()[0] == currBeamSize); assert(indicesExpr_->shape()[1] == batchSize); Expr ret = transpose(indicesExpr_, {1, 0, 2}); @@ -169,8 +157,6 @@ Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { #if BLAS_FOUND - static int c = 0; - std::cerr << "c=" << c++ << std::endl; ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, "LSH index (--output-approx-knn) currently not implemented for GPU"); @@ -194,7 +180,6 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, index_->add( vRows, values->val()->data()); } - std::cerr << "query=" << query->shape() << std::endl; int qRows = query->shape().elements() / dim; std::vector distances(qRows * k_); std::vector ids(qRows * k_); @@ -217,7 +202,6 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, }; Shape kShape({currBeamSize, batchSize, k_}); - std::cerr << "kShape=" << kShape << std::endl; indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; @@ -236,7 +220,6 @@ void LSHShortlist::broadcast(Expr weights, Expr lemmaEt, Expr indicesExprBC, int k) { - std::cerr << "indicesExprBC.0=" << indicesExprBC->shape() << std::endl; int currBeamSize = indicesExprBC->shape()[0]; int batchSize = indicesExprBC->shape()[1]; //int numHypos 
= batchSize * currBeamSize; @@ -248,12 +231,8 @@ void LSHShortlist::broadcast(Expr weights, indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; - std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; - std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); - std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, k, cachedShortWt_->shape()[1]}); - std::cerr << "cachedShortWt_.2=" << cachedShortWt_->shape() << std::endl; if (b) { ABORT("Bias not yet tested"); @@ -262,11 +241,8 @@ void LSHShortlist::broadcast(Expr weights, } cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); - //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); - //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 1, 0, 3}); - //std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits) diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 9eb4c2903..06bafb1cf 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -95,14 +95,12 @@ Expr Logits::getFactoredLogits(size_t groupIndex, ABORT_IF(empty(), "Attempted to read out logits on empty Logits object"); auto sel = logits_[groupIndex]->loss(); // [localBeamSize, 1, dimBatch, dimFactorVocab] - std::cerr << "sel.1=" << sel->shape() << std::endl; // normalize for decoding: // - all secondary factors: subtract their max // - lemma: add all maxes of applicable factors if(groupIndex > 0) { sel = sel - max(sel, -1); - std::cerr << "sel.2=" << sel->shape() << std::endl; } else { auto numGroups = getNumFactorGroups(); for(size_t g = 1; g < numGroups; g++) { @@ -113,7 +111,6 @@ Expr Logits::getFactoredLogits(size_t groupIndex, factorMasks = constant(getFactorMasks(g, std::vector())); } else { - std::cerr << "sel.3=" << sel->shape() << std::endl; auto forward = [this, g](Expr out, const std::vector& inputs) { Expr lastIndices = inputs[0]; std::vector masks = getFactorMasksMultiDim(g, lastIndices); @@ -123,27 +120,18 @@ Expr Logits::getFactoredLogits(size_t groupIndex, int currBeamSize = sel->shape()[0]; int batchSize = sel->shape()[2]; Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); - std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); - std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; factorMasks = transpose(factorMasks, {1, 0, 2}); - std::cerr << "factorMasks.2=" << factorMasks->shape() << std::endl; const Shape &s = factorMasks->shape(); factorMasks = reshape(factorMasks, {s[0], 1, s[1], s[2]}); - std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; } factorMaxima = cast(factorMaxima, sel->value_type()); - std::cerr << "factorMaxima=" << factorMaxima->shape() << std::endl; factorMasks = cast(factorMasks, sel->value_type()); - std::cerr << "factorMasks.4=" << factorMasks->shape() << std::endl; Expr tmp = factorMaxima * factorMasks; - std::cerr << "tmp=" << tmp->shape() << 
std::endl; - std::cerr << "sel.4=" << sel->shape() << std::endl; sel = sel + tmp; // those lemmas that don't have a factor // get multiplied with 0 - std::cerr << "sel.5=" << sel->shape() << std::endl; } } diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 2d5df585b..8b3d1af07 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -171,8 +171,6 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // matrix Expr factorLogits; if(g == 0 && shortlist_) { - std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; - std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; Expr tmp = transpose(input1, {0, 2, 1, 3}); //std::cerr << "x=" << x->shape() << std::endl; //std::cerr << "W=" << W->shape() << std::endl; @@ -182,13 +180,9 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits - std::cerr << "affineShortlist.factorLogits.1=" << factorLogits->shape() << std::endl; factorLogits = transpose(factorLogits, {0, 2, 1, 3}); - std::cerr << "affineShortlist.factorLogits.2=" << factorLogits->shape() << std::endl; } else { - std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; - std::cerr << "affineOrDot.factorWt.1=" << factorWt->shape() << std::endl; //factorWt = transpose(factorWt, {1, 0, 2, 3}); //std::cerr << "affineOrDot.factorWt.2=" << factorWt->shape() << std::endl; factorLogits = affineOrDot( @@ -197,9 +191,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits - std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl; } - std::cerr << std::endl; // optionally add lemma-dependent bias if(Plemma) { // [B... x U0] diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index cefa99376..b15840072 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -101,7 +101,6 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - std::cerr << "currentBatchId=" << currentBatchIdx << " origBatchIdx=" << origBatchIdx << std::endl; word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) currentBatchIdx, (int) prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", From 49998217d9de2c5bfbd65f6ba9b2f4058c0e88f5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Jun 2021 22:47:15 +0000 Subject: [PATCH 070/254] don't transpose lastIndices. 
Works for lsh --- src/data/shortlist.cpp | 5 +++-- src/layers/logits.cpp | 6 ++++-- src/layers/output.cpp | 10 +++++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 7fba4b676..8efd70d49 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -149,8 +149,9 @@ WordIndex LSHShortlist::tryForwardMap(int , int , WordIndex wIdx) const { Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { assert(indicesExpr_->shape()[0] == currBeamSize); assert(indicesExpr_->shape()[1] == batchSize); - Expr ret = transpose(indicesExpr_, {1, 0, 2}); - return ret; + return indicesExpr_; + //Expr ret = transpose(indicesExpr_, {1, 0, 2}); + //return ret; } #define BLAS_FOUND 1 diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 06bafb1cf..5005f6011 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -120,11 +120,13 @@ Expr Logits::getFactoredLogits(size_t groupIndex, int currBeamSize = sel->shape()[0]; int batchSize = sel->shape()[2]; Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); + std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); - factorMasks = transpose(factorMasks, {1, 0, 2}); - + std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; + const Shape &s = factorMasks->shape(); factorMasks = reshape(factorMasks, {s[0], 1, s[1], s[2]}); + std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; } factorMaxima = cast(factorMaxima, sel->value_type()); factorMasks = cast(factorMasks, sel->value_type()); diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 8b3d1af07..eab81124f 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -171,9 +171,11 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // matrix Expr factorLogits; if(g == 0 && shortlist_) { + std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; Expr tmp = transpose(input1, {0, 2, 1, 3}); + std::cerr << "tmp=" << tmp->shape() << std::endl; //std::cerr << "x=" << x->shape() << std::endl; - //std::cerr << "W=" << W->shape() << std::endl; + std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineShortlist( tmp, factorWt, @@ -181,16 +183,18 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits factorLogits = transpose(factorLogits, {0, 2, 1, 3}); + std::cerr << "affineShortlist.factorLogits=" << factorLogits->shape() << std::endl << std::endl; } else { - //factorWt = transpose(factorWt, {1, 0, 2, 3}); - //std::cerr << "affineOrDot.factorWt.2=" << factorWt->shape() << std::endl; + std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; + std::cerr << "affineOrDot.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineOrDot( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits + std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl << std::endl; } // optionally add lemma-dependent bias From 700dc7fdd1412e14388bf61e57df75cadcaa5319 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Jun 2021 22:55:26 +0000 Subject: [PATCH 071/254] don't transpose lastIndices. 
Works for lsh & sl --- src/data/shortlist.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 8efd70d49..0649c19ff 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -54,7 +54,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp Expr Shortlist::getIndicesExpr(int batchSize, int beamSize) const { int k = indicesExpr_->shape()[0]; - Expr ones = indicesExpr_->graph()->constant({batchSize, beamSize, 1}, inits::ones(), Type::float32); + Expr ones = indicesExpr_->graph()->constant({beamSize, batchSize, 1}, inits::ones(), Type::float32); Expr tmp = reshape(indicesExpr_, {1, k}); tmp = cast(tmp, Type::float32); From 8649034760fc937f5b7f523b7ff81bfc2d5bc9f1 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 11 Jun 2021 23:26:10 +0000 Subject: [PATCH 072/254] no need to broadcast --- src/data/shortlist.cpp | 29 +---------------------------- src/layers/logits.cpp | 13 +++++++++---- src/layers/output.cpp | 14 +++++++------- 3 files changed, 17 insertions(+), 39 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 0649c19ff..84f6c6a29 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -54,34 +54,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp Expr Shortlist::getIndicesExpr(int batchSize, int beamSize) const { int k = indicesExpr_->shape()[0]; - Expr ones = indicesExpr_->graph()->constant({beamSize, batchSize, 1}, inits::ones(), Type::float32); - - Expr tmp = reshape(indicesExpr_, {1, k}); - tmp = cast(tmp, Type::float32); - - Expr out = ones * tmp; - //debug(out, "out.1"); - - auto forward = [](Expr out, const std::vector& inputs) { - Expr in = inputs[0]; - const Shape &shape = in->shape(); - const float *inPtr = in->val()->data(); - uint32_t *outPtr = out->val()->data(); - - for (int i = 0; i < shape.elements(); ++i) { - const float &val = inPtr[i]; - uint32_t valConv = (uint32_t)val; - uint32_t &valOut = outPtr[i]; - valOut = valConv; - //std::cerr << val << " " << valConv << " " << valOut << std::endl; - } - }; - out = lambda({out}, out->shape(), Type::uint32, forward); - //debug(out, "out.2"); - //out = cast(out, Type::uint32); - //std::cerr << "getIndicesExpr.2=" << out->shape() << std::endl; - //out = reshape(out, {k}); - + Expr out = reshape(indicesExpr_, {1, 1, k}); return out; } diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 5005f6011..73169f218 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -120,20 +120,25 @@ Expr Logits::getFactoredLogits(size_t groupIndex, int currBeamSize = sel->shape()[0]; int batchSize = sel->shape()[2]; Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); - std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; + //std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); - std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; + //std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; const Shape &s = factorMasks->shape(); factorMasks = reshape(factorMasks, {s[0], 1, s[1], s[2]}); - std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; + //std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; } factorMaxima = cast(factorMaxima, sel->value_type()); factorMasks = cast(factorMasks, sel->value_type()); + //std::cerr << "factorMaxima=" << factorMaxima->shape() << 
std::endl; + //std::cerr << "factorMasks.4=" << factorMasks->shape() << std::endl; + //std::cerr << "sel.1=" << sel->shape() << std::endl; Expr tmp = factorMaxima * factorMasks; + //std::cerr << "tmp=" << tmp->shape() << std::endl; sel = sel + tmp; // those lemmas that don't have a factor - // get multiplied with 0 + //std::cerr << "sel.2=" << sel->shape() << std::endl; + //std::cerr << std::endl; } } diff --git a/src/layers/output.cpp b/src/layers/output.cpp index eab81124f..03e775452 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -171,11 +171,11 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // matrix Expr factorLogits; if(g == 0 && shortlist_) { - std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; + //std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; Expr tmp = transpose(input1, {0, 2, 1, 3}); - std::cerr << "tmp=" << tmp->shape() << std::endl; + //std::cerr << "tmp=" << tmp->shape() << std::endl; //std::cerr << "x=" << x->shape() << std::endl; - std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; + //std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineShortlist( tmp, factorWt, @@ -183,18 +183,18 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits factorLogits = transpose(factorLogits, {0, 2, 1, 3}); - std::cerr << "affineShortlist.factorLogits=" << factorLogits->shape() << std::endl << std::endl; + //std::cerr << "affineShortlist.factorLogits=" << factorLogits->shape() << std::endl << std::endl; } else { - std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; - std::cerr << "affineOrDot.factorWt=" << factorWt->shape() << std::endl; + //std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; + //std::cerr << "affineOrDot.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineOrDot( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... 
x U] factor logits - std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl << std::endl; + //std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl << std::endl; } // optionally add lemma-dependent bias From cc295938ced5bfc079034eca035d907c07bfe7a5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 14 Jun 2021 22:29:27 +0000 Subject: [PATCH 073/254] incorrect dimension order --- src/data/shortlist.cpp | 32 +++++++++++++++++++++----------- src/data/shortlist.h | 1 - 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 84f6c6a29..91fcc6964 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -177,10 +177,12 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Shape kShape({currBeamSize, batchSize, k_}); + //std::cerr << "input=" << input->shape() << std::endl; + //std::cerr << "weights=" << weights->shape() << std::endl; indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; - broadcast(weights, isLegacyUntransposedW, b, lemmaEt, indicesExpr_, k_); + broadcast(weights, isLegacyUntransposedW, b, lemmaEt, k_); #else input; weights; isLegacyUntransposedW; b; lemmaEt; @@ -192,31 +194,39 @@ void LSHShortlist::broadcast(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, - Expr indicesExprBC, int k) { - int currBeamSize = indicesExprBC->shape()[0]; - int batchSize = indicesExprBC->shape()[1]; + std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; + int currBeamSize = indicesExpr_->shape()[0]; + int batchSize = indicesExpr_->shape()[1]; //int numHypos = batchSize * currBeamSize; //std::cerr << "batchSize=" << batchSize << std::endl; //std::cerr << "currBeamSize=" << currBeamSize << std::endl; //std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); - indicesExprBC = reshape(indicesExprBC, {indicesExprBC->shape().elements()}); - //std::cerr << "indicesExprBC.2=" << indicesExprBC->shape() << std::endl; + Expr indicesExprFlatten = reshape(indicesExpr_, {indicesExpr_->shape().elements()}); + std::cerr << "indicesExprFlatten=" << indicesExprFlatten->shape() << std::endl; - cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprBC); + std::cerr << "weights=" << weights->shape() << std::endl; + cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExprFlatten); + std::cerr << "cachedShortWt.1_=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, k, cachedShortWt_->shape()[1]}); + std::cerr << "cachedShortWt.2_=" << cachedShortWt_->shape() << std::endl; if (b) { ABORT("Bias not yet tested"); - cachedShortb_ = index_select(b, -1, indicesExprBC); + cachedShortb_ = index_select(b, -1, indicesExprFlatten); cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested } - cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprBC); - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {cachedShortLemmaEt_->shape()[0], batchSize, currBeamSize, k}); - cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {2, 1, 0, 3}); + std::cerr << "lemmaEt=" << lemmaEt->shape() << std::endl; + int dim = lemmaEt->shape()[0]; + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprFlatten); + std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, k}); + std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; + cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 2, 0, 3}); + std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits) diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 1c1243abe..e59a8fa01 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -77,7 +77,6 @@ class LSHShortlist: public Shortlist { bool isLegacyUntransposedW, Expr b, Expr lemmaEt, - Expr indicesExprBC, int k); public: From dffbb47eea062d562098f200beb53861af1c5d0b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 14 Jun 2021 23:18:21 +0000 Subject: [PATCH 074/254] rename broadcast -> createCachedTensors --- src/data/shortlist.cpp | 17 ++++------------- src/data/shortlist.h | 4 ++-- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 91fcc6964..3eaa15f75 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -48,7 +48,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; - broadcast(weights, isLegacyUntransposedW, b, lemmaEt, k); + createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k); done_ = true; } @@ -58,7 +58,7 @@ Expr Shortlist::getIndicesExpr(int batchSize, int beamSize) const { return out; } -void Shortlist::broadcast(Expr weights, +void Shortlist::createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, @@ -182,7 +182,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; - broadcast(weights, isLegacyUntransposedW, b, lemmaEt, k_); + createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k_); #else input; weights; isLegacyUntransposedW; b; lemmaEt; @@ -190,12 +190,11 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, #endif } -void LSHShortlist::broadcast(Expr weights, +void LSHShortlist::createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, int k) { - std::cerr << 
"indicesExpr_=" << indicesExpr_->shape() << std::endl; int currBeamSize = indicesExpr_->shape()[0]; int batchSize = indicesExpr_->shape()[1]; //int numHypos = batchSize * currBeamSize; @@ -205,13 +204,9 @@ void LSHShortlist::broadcast(Expr weights, ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); Expr indicesExprFlatten = reshape(indicesExpr_, {indicesExpr_->shape().elements()}); - std::cerr << "indicesExprFlatten=" << indicesExprFlatten->shape() << std::endl; - std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprFlatten); - std::cerr << "cachedShortWt.1_=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, k, cachedShortWt_->shape()[1]}); - std::cerr << "cachedShortWt.2_=" << cachedShortWt_->shape() << std::endl; if (b) { ABORT("Bias not yet tested"); @@ -219,14 +214,10 @@ void LSHShortlist::broadcast(Expr weights, cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested } - std::cerr << "lemmaEt=" << lemmaEt->shape() << std::endl; int dim = lemmaEt->shape()[0]; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprFlatten); - std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, k}); - std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 2, 0, 3}); - std::cerr << "cachedShortLemmaEt.3_=" << cachedShortLemmaEt_->shape() << std::endl; } LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits) diff --git a/src/data/shortlist.h b/src/data/shortlist.h index e59a8fa01..0ed6eae38 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -31,7 +31,7 @@ class Shortlist { Expr cachedShortLemmaEt_; bool done_; - void broadcast(Expr weights, + void createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, @@ -73,7 +73,7 @@ class LSHShortlist: public Shortlist { static Ptr index_; - void broadcast(Expr weights, + void createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt, From 8c04f6647422ce66b9ee7e9c6ff7b1c03c5f0a80 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 15 Jun 2021 00:10:08 +0000 Subject: [PATCH 075/254] reverse batch beam argument order --- src/data/shortlist.cpp | 2 +- src/data/shortlist.h | 4 ++-- src/translator/beam_search.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 3eaa15f75..8bc25178a 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -101,7 +101,7 @@ LSHShortlist::LSHShortlist(int k, int nbits) //#define BLAS_FOUND 1 -WordIndex LSHShortlist::reverseMap(int batchIdx, int beamIdx, int idx) const { +WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { //int currBeamSize = indicesExpr_->shape()[0]; int currBatchSize = indicesExpr_->shape()[1]; idx = (k_ * currBatchSize * beamIdx) + (k_ * batchIdx) + idx; diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 0ed6eae38..2b8953bd2 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -42,7 +42,7 @@ class Shortlist { Shortlist(const std::vector& indices); virtual ~Shortlist(); - virtual WordIndex reverseMap(int batchIdx, int beamIdx, int idx) const; + virtual WordIndex reverseMap(int beamIdx, int batchIdx, 
int idx) const; virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); @@ -81,7 +81,7 @@ class LSHShortlist: public Shortlist { public: LSHShortlist(int k, int nbits); - virtual WordIndex reverseMap(int batchIdx, int beamIdx, int idx) const override; + virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const override; virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const override; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index b15840072..eda288a4d 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -101,7 +101,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) currentBatchIdx, (int) prevBeamHypIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), @@ -122,7 +122,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } } else if (shortlist) - word = Word::fromWordIndex(shortlist->reverseMap((int) origBatchIdx, (int) prevBeamHypIdx, wordIdx)); + word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) origBatchIdx, wordIdx)); else word = Word::fromWordIndex(wordIdx); From 5b7b1f7e5cab36be6b0ed4293b93cd8831bfa169 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 14 Jun 2021 18:44:05 -0700 Subject: [PATCH 076/254] no need for args in getIndicesExpr(). Deleted debugging --- src/data/shortlist.cpp | 25 ++----------------------- src/data/shortlist.h | 4 ++-- src/layers/logits.cpp | 14 ++++---------- 3 files changed, 8 insertions(+), 35 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 8bc25178a..de6f83029 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -52,7 +52,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp done_ = true; } -Expr Shortlist::getIndicesExpr(int batchSize, int beamSize) const { +Expr Shortlist::getIndicesExpr() const { int k = indicesExpr_->shape()[0]; Expr out = reshape(indicesExpr_, {1, 1, k}); return out; @@ -63,13 +63,8 @@ void Shortlist::createCachedTensors(Expr weights, Expr b, Expr lemmaEt, int k) { - //std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); - - //std::cerr << "currBeamSize=" << currBeamSize << " batchSize=" << batchSize << std::endl; - //std::cerr << "weights=" << weights->shape() << std::endl; cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExpr_); - //std::cerr << "cachedShortWt_.1=" << cachedShortWt_->shape() << std::endl; cachedShortWt_ = reshape(cachedShortWt_, {1, 1, cachedShortWt_->shape()[0], cachedShortWt_->shape()[1]}); if (b) { @@ -78,11 +73,8 @@ void Shortlist::createCachedTensors(Expr weights, cachedShortb_ = reshape(cachedShortb_, {1, k, 1, cachedShortb_->shape()[1]}); // not tested } - //std::cerr << "lemmaEt.1_=" << lemmaEt->shape() << std::endl; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExpr_); - //std::cerr << "cachedShortLemmaEt.1_=" << cachedShortLemmaEt_->shape() << std::endl; cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {1, 1, cachedShortLemmaEt_->shape()[0], k}); - //std::cerr << "cachedShortLemmaEt.2_=" << cachedShortLemmaEt_->shape() << std::endl; } /////////////////////////////////////////////////////////////////////////////////// @@ -110,7 +102,6 @@ WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { } WordIndex LSHShortlist::tryForwardMap(int , int , WordIndex wIdx) const { - //utils::Debug(indices_, "LSHShortlist::tryForwardMap indices_"); auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); bool found = first != indices_.end(); if(found && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx @@ -119,12 +110,8 @@ WordIndex LSHShortlist::tryForwardMap(int , int , WordIndex wIdx) const { return npos; // return npos if not found, @TODO: replace with std::optional once we switch to C++17? } -Expr LSHShortlist::getIndicesExpr(int batchSize, int currBeamSize) const { - assert(indicesExpr_->shape()[0] == currBeamSize); - assert(indicesExpr_->shape()[1] == batchSize); +Expr LSHShortlist::getIndicesExpr() const { return indicesExpr_; - //Expr ret = transpose(indicesExpr_, {1, 0, 2}); - //return ret; } #define BLAS_FOUND 1 @@ -176,11 +163,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, }; Shape kShape({currBeamSize, batchSize, k_}); - - //std::cerr << "input=" << input->shape() << std::endl; - //std::cerr << "weights=" << weights->shape() << std::endl; indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); - //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k_); @@ -197,10 +180,6 @@ void LSHShortlist::createCachedTensors(Expr weights, int k) { int currBeamSize = indicesExpr_->shape()[0]; int batchSize = indicesExpr_->shape()[1]; - //int numHypos = batchSize * currBeamSize; - //std::cerr << "batchSize=" << batchSize << std::endl; - //std::cerr << "currBeamSize=" << currBeamSize << std::endl; - //std::cerr << "isLegacyUntransposedW=" << isLegacyUntransposedW << std::endl; ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); Expr indicesExprFlatten = reshape(indicesExpr_, {indicesExpr_->shape().elements()}); diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 2b8953bd2..526a2fa36 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -46,7 +46,7 @@ class Shortlist { virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); - virtual Expr getIndicesExpr(int batchSize, int currBeamSize) const; + virtual Expr getIndicesExpr() const; virtual Expr getCachedShortWt() const { return cachedShortWt_; } virtual Expr getCachedShortb() const { return cachedShortb_; } virtual Expr getCachedShortLemmaEt() const { 
return cachedShortLemmaEt_; } @@ -85,7 +85,7 @@ class LSHShortlist: public Shortlist { virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const override; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; - virtual Expr getIndicesExpr(int batchSize,int currBeamSize) const override; + virtual Expr getIndicesExpr() const override; }; diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 73169f218..109f27024 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -119,26 +119,20 @@ Expr Logits::getFactoredLogits(size_t groupIndex, int currBeamSize = sel->shape()[0]; int batchSize = sel->shape()[2]; - Expr lastIndices = shortlist->getIndicesExpr(batchSize, currBeamSize); - //std::cerr << "lastIndices=" << lastIndices->shape() << std::endl; + Expr lastIndices = shortlist->getIndicesExpr(); + assert(lastIndices->shape()[0] == currBeamSize || lastIndices->shape()[0] == 1); + assert(lastIndices->shape()[1] == batchSize || lastIndices->shape()[1] == 1); + factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); - //std::cerr << "factorMasks.1=" << factorMasks->shape() << std::endl; const Shape &s = factorMasks->shape(); factorMasks = reshape(factorMasks, {s[0], 1, s[1], s[2]}); - //std::cerr << "factorMasks.3=" << factorMasks->shape() << std::endl; } factorMaxima = cast(factorMaxima, sel->value_type()); factorMasks = cast(factorMasks, sel->value_type()); - //std::cerr << "factorMaxima=" << factorMaxima->shape() << std::endl; - //std::cerr << "factorMasks.4=" << factorMasks->shape() << std::endl; - //std::cerr << "sel.1=" << sel->shape() << std::endl; Expr tmp = factorMaxima * factorMasks; - //std::cerr << "tmp=" << tmp->shape() << std::endl; sel = sel + tmp; // those lemmas that don't have a factor - //std::cerr << "sel.2=" << sel->shape() << std::endl; - //std::cerr << std::endl; } } From 5362c2cc0e9448593f88bd0e557e883a7a633093 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 14 Jun 2021 18:47:08 -0700 Subject: [PATCH 077/254] don't define BLAS_FOUND --- src/data/shortlist.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index de6f83029..97a115141 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -114,8 +114,6 @@ Expr LSHShortlist::getIndicesExpr() const { return indicesExpr_; } -#define BLAS_FOUND 1 - void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { #if BLAS_FOUND ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, From 82fa059a03c8c7d915b5220328adfaf71270f85d Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 14 Jun 2021 19:03:53 -0700 Subject: [PATCH 078/254] 'use' variables --- src/layers/logits.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 109f27024..e8fe691b9 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -122,7 +122,8 @@ Expr Logits::getFactoredLogits(size_t groupIndex, Expr lastIndices = shortlist->getIndicesExpr(); assert(lastIndices->shape()[0] == currBeamSize || lastIndices->shape()[0] == 1); assert(lastIndices->shape()[1] == batchSize || lastIndices->shape()[1] == 1); - + currBeamSize; currBeamSize; + factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); const Shape &s = factorMasks->shape(); From 7e6ec58507a946ee9e00aa80ea0835aff068b319 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 14 
Jun 2021 19:07:17 -0700 Subject: [PATCH 079/254] delete variables altogether --- src/layers/logits.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index e8fe691b9..1830741ec 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -117,12 +117,11 @@ Expr Logits::getFactoredLogits(size_t groupIndex, out->val()->set(masks); }; - int currBeamSize = sel->shape()[0]; - int batchSize = sel->shape()[2]; + //int currBeamSize = sel->shape()[0]; + //int batchSize = sel->shape()[2]; Expr lastIndices = shortlist->getIndicesExpr(); - assert(lastIndices->shape()[0] == currBeamSize || lastIndices->shape()[0] == 1); - assert(lastIndices->shape()[1] == batchSize || lastIndices->shape()[1] == 1); - currBeamSize; currBeamSize; + //assert(lastIndices->shape()[0] == currBeamSize || lastIndices->shape()[0] == 1); + //assert(lastIndices->shape()[1] == batchSize || lastIndices->shape()[1] == 1); factorMasks = lambda({lastIndices}, lastIndices->shape(), Type::float32, forward); From 488a532bdf85b71276de662ae160172171ca97fc Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 15 Jun 2021 16:54:17 -0700 Subject: [PATCH 080/254] get lemma size from vocab class --- src/data/factored_vocab.cpp | 5 +++++ src/data/factored_vocab.h | 2 ++ src/data/shortlist.cpp | 18 +++++++++--------- src/data/shortlist.h | 8 ++++---- src/data/vocab.cpp | 4 ++++ src/data/vocab.h | 3 +++ src/data/vocab_base.h | 1 + src/translator/translator.h | 3 ++- 8 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index 17a5bfb74..e68fb5c99 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -244,6 +244,10 @@ void FactoredVocab::rCompleteVocab(std::vector& factorIndices, size_t g) } } +size_t FactoredVocab::lemmaSize() const { + return lemmaSize_; +} + void FactoredVocab::constructGroupInfoFromFactorVocab() { // form groups size_t numGroups = groupPrefixes_.size(); @@ -270,6 +274,7 @@ void FactoredVocab::constructGroupInfoFromFactorVocab() { groupRanges_[g].second = u + 1; groupCounts[g]++; } + lemmaSize_ = groupCounts[0]; for (size_t g = 0; g < numGroups; g++) { // detect non-overlapping groups LOG(info, "[vocab] Factor group '{}' has {} members", groupPrefixes_[g], groupCounts[g]); if (groupCounts[g] == 0) { // factor group is unused --@TODO: once this is not hard-coded, this is an error condition diff --git a/src/data/factored_vocab.h b/src/data/factored_vocab.h index 215e92f09..6b96d8cd3 100644 --- a/src/data/factored_vocab.h +++ b/src/data/factored_vocab.h @@ -46,6 +46,7 @@ class FactoredVocab : public IVocab { // factor-specific. These methods are consumed by Output and Embedding. 
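// A reduced sketch of the lemmaSize() plumbing this commit introduces, using simplified stand-in
// structs rather than the real classes: the base vocab interface falls back to the full vocabulary
// size, FactoredVocab reports the size of factor group 0 (the lemmas), and the LSH shortlist trains
// its index on exactly that many rows instead of a hard-coded row count.
#include <cstddef>

struct IVocabSketch {
  virtual ~IVocabSketch() = default;
  virtual size_t size() const = 0;
  virtual size_t lemmaSize() const { return size(); }  // non-factored vocabs: every item is a lemma
};

struct FactoredVocabSketch : IVocabSketch {
  size_t virtualVocabSize_ = 0;  // all factor combinations (illustrative default)
  size_t lemmaSize_ = 0;         // members of factor group 0, filled in while building group info
  size_t size() const override { return virtualVocabSize_; }
  size_t lemmaSize() const override { return lemmaSize_; }
};

// The value is threaded Vocab -> createShortlistGenerator -> LSHShortlistGenerator -> LSHShortlist,
// which then trains and fills the FAISS index with that many rows:
//   index_->train(lemmaSize_, values->val()->data());
//   index_->add  (lemmaSize_, values->val()->data());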
size_t factorVocabSize() const { return factorVocab_.size(); } // total number of factors across all types size_t virtualVocabSize() const { return factorShape_.elements(); } // valid WordIndex range (representing all factor combinations including gaps); virtual and huge + virtual size_t lemmaSize() const override; CSRData csr_rows(const Words& words) const; // sparse matrix for summing up factors from the concatenated embedding matrix for each word @@ -116,6 +117,7 @@ class FactoredVocab : public IVocab { Word eosId_{}; Word unkId_{}; WordLUT vocab_; + size_t lemmaSize_; // factors char factorSeparator_ = '|'; // separator symbol for parsing factored words diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 8bc25178a..832d575be 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -88,9 +88,9 @@ void Shortlist::createCachedTensors(Expr weights, /////////////////////////////////////////////////////////////////////////////////// Ptr LSHShortlist::index_; -LSHShortlist::LSHShortlist(int k, int nbits) +LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize) : Shortlist(std::vector()) -, k_(k), nbits_(nbits) { +, k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { //std::cerr << "LSHShortlist" << std::endl; /* for (int i = 0; i < k_; ++i) { @@ -149,9 +149,8 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, index_.reset(new faiss::IndexLSH(dim, nbits_, /*rotate=*/dim != nbits_, /*train_thesholds*/false)); - int vRows = 32121; //47960; //values->shape().elements() / dim; - index_->train(vRows, values->val()->data()); - index_->add( vRows, values->val()->data()); + index_->train(lemmaSize_, values->val()->data()); + index_->add( lemmaSize_, values->val()->data()); } int qRows = query->shape().elements() / dim; @@ -220,13 +219,13 @@ void LSHShortlist::createCachedTensors(Expr weights, cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 2, 0, 3}); } -LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits) - : k_(k), nbits_(nbits) { +LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize) + : k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { //std::cerr << "LSHShortlistGenerator" << std::endl; } Ptr LSHShortlistGenerator::generate(Ptr batch) const { - return New(k_, nbits_); + return New(k_, nbits_, lemmaSize_); } ////////////////////////////////////////////////////////////////////////////////////// @@ -359,7 +358,8 @@ Ptr createShortlistGenerator(Ptr options, size_t trgIdx, bool shared) { if (lshOpts.size() == 2) { - return New(lshOpts[0], lshOpts[1]); + size_t lemmaSize = trgVocab->lemmaSize(); + return New(lshOpts[0], lshOpts[1], lemmaSize); } else { std::vector vals = options->get>("shortlist"); diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 2b8953bd2..315fdbcd2 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -70,7 +70,7 @@ class LSHShortlist: public Shortlist { private: int k_; int nbits_; - + size_t lemmaSize_; static Ptr index_; void createCachedTensors(Expr weights, @@ -80,7 +80,7 @@ class LSHShortlist: public Shortlist { int k); public: - LSHShortlist(int k, int nbits); + LSHShortlist(int k, int nbits, size_t lemmaSize); virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const override; virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const override; @@ -93,9 +93,9 @@ class LSHShortlistGenerator : public ShortlistGenerator { private: int k_; int nbits_; - + size_t lemmaSize_; public: - LSHShortlistGenerator(int k, int nbits); 
+ LSHShortlistGenerator(int k, int nbits, size_t lemmaSize); Ptr generate(Ptr batch) const override; }; diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 8a3d49c78..38eddd01e 100644 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -129,6 +129,10 @@ std::string Vocab::surfaceForm(const Words& sentence) const { // number of vocabulary items size_t Vocab::size() const { return vImpl_->size(); } +size_t Vocab::lemmaSize() const { + return vImpl_->lemmaSize(); +} + // number of vocabulary items std::string Vocab::type() const { return vImpl_->type(); } diff --git a/src/data/vocab.h b/src/data/vocab.h index 2ab6b2b0a..f4a7e0b7f 100644 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -61,6 +61,9 @@ class Vocab { // number of vocabulary items size_t size() const; + // number of vocabulary items + size_t lemmaSize() const; + // number of vocabulary items std::string type() const; diff --git a/src/data/vocab_base.h b/src/data/vocab_base.h index fc5120260..419c0e325 100644 --- a/src/data/vocab_base.h +++ b/src/data/vocab_base.h @@ -39,6 +39,7 @@ class IVocab { virtual const std::string& operator[](Word id) const = 0; virtual size_t size() const = 0; + virtual size_t lemmaSize() const { return size(); } virtual std::string type() const = 0; virtual Word getEosId() const = 0; diff --git a/src/translator/translator.h b/src/translator/translator.h index 511a42507..f4f9ec4cd 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -63,8 +63,9 @@ class Translate : public ModelTask { auto srcVocab = corpus_->getVocabs()[0]; std::vector lshOpts = options_->get>("output-approx-knn"); - if (lshOpts.size() == 2 || options_->hasAndNotEmpty("shortlist")) + if (lshOpts.size() == 2 || options_->hasAndNotEmpty("shortlist")) { shortlistGenerator_ = data::createShortlistGenerator(options_, srcVocab, trgVocab_, lshOpts, 0, 1, vocabs.front() == vocabs.back()); + } auto devices = Config::getDevices(options_); numDevices_ = devices.size(); From 395a4f94d0d3182600d22203625bd1be1f8a042b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 15 Jun 2021 18:08:45 -0700 Subject: [PATCH 081/254] init vector --- src/data/factored_vocab.cpp | 2 +- src/data/shortlist.cpp | 2 +- src/tensors/cpu/prod.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index e68fb5c99..e26a84799 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -265,7 +265,7 @@ void FactoredVocab::constructGroupInfoFromFactorVocab() { } // determine group index ranges groupRanges_.resize(numGroups, { SIZE_MAX, (size_t)0 }); - std::vector groupCounts(numGroups); // number of group members + std::vector groupCounts(numGroups, 0); // number of group members for (WordIndex u = 0; u < factorVocabSize; u++) { // determine ranges; these must be non-overlapping, verified via groupCounts auto g = factorGroups_[u]; if (groupRanges_[g].first > u) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 36e2d22fb..9943198bc 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -83,7 +83,7 @@ Ptr LSHShortlist::index_; LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize) : Shortlist(std::vector()) , k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { - //std::cerr << "LSHShortlist" << std::endl; + std::cerr << "LSHShortlist lemmaSize_=" << lemmaSize_ << std::endl; /* for (int i = 0; i < k_; ++i) { indices_.push_back(i); diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 07cc2b99e..313e6fbd9 100755 
--- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -184,7 +184,7 @@ void ProdBatched(marian::Tensor C, // This loop initializes the array pointers in the same way as the for loop // in the normal sgemm version a few lines below functional::Array dims; - for(size_t i = 0; i < batchC; ++i) { + for(int i = 0; i < batchC; ++i) { cShapeMetaF.dims(i, dims); auto aIndex = aShapeMetaF.bindex(dims); auto bIndex = bShapeMetaF.bindex(dims); From 892554129e2a1700755ed0e97ccb1cd5fa6b76f5 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 16 Jun 2021 10:19:24 -0700 Subject: [PATCH 082/254] lemma Et is optional --- src/data/shortlist.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 9943198bc..808ffd7a9 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -83,7 +83,6 @@ Ptr LSHShortlist::index_; LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize) : Shortlist(std::vector()) , k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { - std::cerr << "LSHShortlist lemmaSize_=" << lemmaSize_ << std::endl; /* for (int i = 0; i < k_; ++i) { indices_.push_back(i); @@ -190,10 +189,12 @@ void LSHShortlist::createCachedTensors(Expr weights, cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested } - int dim = lemmaEt->shape()[0]; - cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprFlatten); - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, k}); - cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 2, 0, 3}); + if (lemmaEt) { + int dim = lemmaEt->shape()[0]; + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprFlatten); + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, k}); + cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 2, 0, 3}); + } } LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize) From 9b4a845cc7db127c8d29c990128893fc974128a4 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 16 Jun 2021 11:19:23 -0700 Subject: [PATCH 083/254] clean up bias --- src/data/shortlist.cpp | 4 ++-- src/layers/output.cpp | 28 ++++++++++++++++------------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 808ffd7a9..a965f249d 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -184,9 +184,9 @@ void LSHShortlist::createCachedTensors(Expr weights, cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, k, cachedShortWt_->shape()[1]}); if (b) { - ABORT("Bias not yet tested"); + ABORT("Bias not supported with LSH"); cachedShortb_ = index_select(b, -1, indicesExprFlatten); - cachedShortb_ = reshape(cachedShortb_, {currBeamSize, k, batchSize, cachedShortb_->shape()[1]}); // not tested + cachedShortb_ = reshape(cachedShortb_, {currBeamSize, batchSize, k, cachedShortb_->shape()[0]}); // not tested } if (lemmaEt) { diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 03e775452..055f8cae8 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -56,16 +56,28 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { lazyConstruct(input->shape()[-1]); auto affineOrDot = [](Expr x, Expr W, Expr b, bool transA, bool transB) { + /* + std::cerr << "affineOrDot.x=" << x->shape() << std::endl; + std::cerr << "affineOrDot.W=" << W->shape() << std::endl; + std::cerr << "affineOrDot.b=" << b->shape() << std::endl; + std::cerr << 
"affineOrDot.transA=" << transA << " transB=" << transB << std::endl; + */ if(b) return affine(x, W, b, transA, transB); else return dot(x, W, transA, transB); }; - auto affineShortlist = [](Expr x, Expr W, Expr b, bool , bool ) { - //std::cerr << "x=" << x->shape() << std::endl; - //std::cerr << "W=" << W->shape() << std::endl; - Expr ret = bdot(x, W, false, true); + auto affineShortlist = [](Expr x, Expr W, Expr b, bool transA, bool transB) { + /* + std::cerr << "affineShortlist.x=" << x->shape() << std::endl; + std::cerr << "affineShortlist.W=" << W->shape() << std::endl; + std::cerr << "affineShortlist.b=" << b->shape() << std::endl; + std::cerr << "affineShortlist.transA=" << transA << " transB=" << transB << std::endl; + */ + ABORT_IF(!(!transA && transB), "Must be transA==0 and transB==1"); + ABORT_IF(b, "affineShortlist not tested with bias"); + Expr ret = bdot(x, W, transA, transB); //std::cerr << "ret.2=" << ret->shape() << std::endl; //std::cerr << std::endl; @@ -171,11 +183,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { // matrix Expr factorLogits; if(g == 0 && shortlist_) { - //std::cerr << "affineShortlist.input1=" << input1->shape() << std::endl; Expr tmp = transpose(input1, {0, 2, 1, 3}); - //std::cerr << "tmp=" << tmp->shape() << std::endl; - //std::cerr << "x=" << x->shape() << std::endl; - //std::cerr << "affineShortlist.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineShortlist( tmp, factorWt, @@ -183,18 +191,14 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits factorLogits = transpose(factorLogits, {0, 2, 1, 3}); - //std::cerr << "affineShortlist.factorLogits=" << factorLogits->shape() << std::endl << std::endl; } else { - //std::cerr << "affineOrDot.input1=" << input1->shape() << std::endl; - //std::cerr << "affineOrDot.factorWt=" << factorWt->shape() << std::endl; factorLogits = affineOrDot( input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... 
x U] factor logits - //std::cerr << "affineOrDot.factorLogits=" << factorLogits->shape() << std::endl << std::endl; } // optionally add lemma-dependent bias From 85eb6adce0d6e3d25f9fcac2fc5763373a9ef4f7 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 16 Jun 2021 12:40:38 -0700 Subject: [PATCH 084/254] update sentencepiece pointer to version with case-awareness --- src/3rd_party/sentencepiece | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 6f24a6b52..5bafa8e8c 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 6f24a6b52a521a3467e99a9c175ba9e136905217 +Subproject commit 5bafa8e8c3391bbe9721a16e986408341f95774c From a332e550a5cf236d5ab97fea3a512c3eff5d3947 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 16 Jun 2021 12:56:36 -0700 Subject: [PATCH 085/254] debug --- src/layers/output.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 055f8cae8..964cb724c 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -56,12 +56,12 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { lazyConstruct(input->shape()[-1]); auto affineOrDot = [](Expr x, Expr W, Expr b, bool transA, bool transB) { - /* + std::cerr << "affineOrDot.x=" << x->shape() << std::endl; std::cerr << "affineOrDot.W=" << W->shape() << std::endl; std::cerr << "affineOrDot.b=" << b->shape() << std::endl; std::cerr << "affineOrDot.transA=" << transA << " transB=" << transB << std::endl; - */ + if(b) return affine(x, W, b, transA, transB); else @@ -78,8 +78,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { ABORT_IF(!(!transA && transB), "Must be transA==0 and transB==1"); ABORT_IF(b, "affineShortlist not tested with bias"); Expr ret = bdot(x, W, transA, transB); - - //std::cerr << "ret.2=" << ret->shape() << std::endl; + //std::cerr << "ret=" << ret->shape() << std::endl; //std::cerr << std::endl; return ret; }; From cd292d3b32428b6c1cf57e9eb6ad06b1db1e5452 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 18 Jun 2021 10:18:31 -0700 Subject: [PATCH 086/254] changes for review --- src/common/utils.h | 2 +- src/data/factored_vocab.cpp | 3 +++ src/data/shortlist.cpp | 22 ++-------------------- src/data/shortlist.h | 15 ++++++++------- src/data/vocab.cpp | 2 +- src/data/vocab.h | 2 +- src/layers/logits.cpp | 2 -- src/layers/output.cpp | 4 ++-- src/translator/beam_search.cpp | 2 +- 9 files changed, 19 insertions(+), 35 deletions(-) diff --git a/src/common/utils.h b/src/common/utils.h index d8d387a82..13b50c0bd 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -63,7 +63,7 @@ std::string findReplace(const std::string& in, const std::string& what, const st double parseDouble(std::string s); double parseNumber(std::string s); - +// prints vector values with a custom label. 
template void Debug(const T *arr, size_t size, const std::string &str) { std::cerr << str << ":" << size << ": "; diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index e26a84799..4c5207dd5 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -274,7 +274,10 @@ void FactoredVocab::constructGroupInfoFromFactorVocab() { groupRanges_[g].second = u + 1; groupCounts[g]++; } + + // required by LSH shortlist lemmaSize_ = groupCounts[0]; + for (size_t g = 0; g < numGroups; g++) { // detect non-overlapping groups LOG(info, "[vocab] Factor group '{}' has {} members", groupPrefixes_[g], groupCounts[g]); if (groupCounts[g] == 0) { // factor group is unused --@TODO: once this is not hard-coded, this is an error condition diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index a965f249d..b7c034365 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -24,9 +24,9 @@ Shortlist::Shortlist(const std::vector& indices) Shortlist::~Shortlist() {} -WordIndex Shortlist::reverseMap(int , int , int idx) const { return indices_[idx]; } +WordIndex Shortlist::reverseMap(int /*beamIdx*/, int /*batchIdx*/, int idx) const { return indices_[idx]; } -WordIndex Shortlist::tryForwardMap(int , int , WordIndex wIdx) const { +WordIndex Shortlist::tryForwardMap(WordIndex wIdx) const { auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx return (int)std::distance(indices_.begin(), first); // return coordinate if found @@ -83,15 +83,8 @@ Ptr LSHShortlist::index_; LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize) : Shortlist(std::vector()) , k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { - /* - for (int i = 0; i < k_; ++i) { - indices_.push_back(i); - } - */ } -//#define BLAS_FOUND 1 - WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { //int currBeamSize = indicesExpr_->shape()[0]; int currBatchSize = indicesExpr_->shape()[1]; @@ -100,15 +93,6 @@ WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { return indices_[idx]; } -WordIndex LSHShortlist::tryForwardMap(int , int , WordIndex wIdx) const { - auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); - bool found = first != indices_.end(); - if(found && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx - return (int)std::distance(indices_.begin(), first); // return coordinate if found - else - return npos; // return npos if not found, @TODO: replace with std::optional once we switch to C++17? 
-} - Expr LSHShortlist::getIndicesExpr() const { return indicesExpr_; } @@ -128,7 +112,6 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, int dim = values->shape()[-1]; if(!index_) { - //std::cerr << "build lsh index" << std::endl; LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_); index_.reset(new faiss::IndexLSH(dim, nbits_, /*rotate=*/dim != nbits_, @@ -199,7 +182,6 @@ void LSHShortlist::createCachedTensors(Expr weights, LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize) : k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { - //std::cerr << "LSHShortlistGenerator" << std::endl; } Ptr LSHShortlistGenerator::generate(Ptr batch) const { diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 1d8903e63..cd96e0d79 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -29,7 +29,7 @@ class Shortlist { Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) Expr cachedShortb_; // these match the current value of shortlist_ Expr cachedShortLemmaEt_; - bool done_; + bool done_; // used by batch-level shortlist. Only initialize with 1st call then skip all subsequent calls for same batch void createCachedTensors(Expr weights, bool isLegacyUntransposedW, @@ -43,7 +43,7 @@ class Shortlist { virtual ~Shortlist(); virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const; - virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const; + virtual WordIndex tryForwardMap(WordIndex wIdx) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); virtual Expr getIndicesExpr() const; @@ -66,12 +66,14 @@ class ShortlistGenerator { }; /////////////////////////////////////////////////////////////////////////////////// +// implements SLIDE for faster inference. +// https://arxiv.org/pdf/1903.03129.pdf class LSHShortlist: public Shortlist { private: - int k_; - int nbits_; - size_t lemmaSize_; - static Ptr index_; + int k_; // number of candidates returned from each input + int nbits_; // length of hash + size_t lemmaSize_; // vocab size + static Ptr index_; // LSH index to store all possible candidates void createCachedTensors(Expr weights, bool isLegacyUntransposedW, @@ -82,7 +84,6 @@ class LSHShortlist: public Shortlist { public: LSHShortlist(int k, int nbits, size_t lemmaSize); virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const override; - virtual WordIndex tryForwardMap(int batchIdx, int beamIdx, WordIndex wIdx) const override; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; virtual Expr getIndicesExpr() const override; diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 38eddd01e..82a4b8da1 100644 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -133,7 +133,7 @@ size_t Vocab::lemmaSize() const { return vImpl_->lemmaSize(); } -// number of vocabulary items +// type of vocabulary items std::string Vocab::type() const { return vImpl_->type(); } // return EOS symbol id diff --git a/src/data/vocab.h b/src/data/vocab.h index f4a7e0b7f..4af82e8e8 100644 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -61,7 +61,7 @@ class Vocab { // number of vocabulary items size_t size() const; - // number of vocabulary items + // number of lemma items. 
Same as size() except in factored models size_t lemmaSize() const; // number of vocabulary items diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 1830741ec..0bd8aa911 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -247,8 +247,6 @@ std::vector Logits::getFactorMasks(size_t factorGroup, const std::vector< std::vector Logits::getFactorMasksMultiDim(size_t factorGroup, Expr indicesExpr) const { // [lemmaIndex] -> 1.0 for words that do have this factor; else 0 - //std::cerr << "indicesExpr=" << indicesExpr->shape() << std::endl; - //int batchSize int batchSize = indicesExpr->shape()[0]; int currBeamSize = indicesExpr->shape()[1]; int numHypos = batchSize * currBeamSize; diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 964cb724c..21eb3714c 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -56,12 +56,12 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { lazyConstruct(input->shape()[-1]); auto affineOrDot = [](Expr x, Expr W, Expr b, bool transA, bool transB) { - + /* std::cerr << "affineOrDot.x=" << x->shape() << std::endl; std::cerr << "affineOrDot.W=" << W->shape() << std::endl; std::cerr << "affineOrDot.b=" << b->shape() << std::endl; std::cerr << "affineOrDot.transA=" << transA << " transB=" << transB << std::endl; - + */ if(b) return affine(x, W, b, transA, transB); else diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index eda288a4d..94de3db06 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -315,7 +315,7 @@ Histories BeamSearch::search(Ptr graph, Ptr suppressed.erase(std::remove_if(suppressed.begin(), suppressed.end(), [&](WordIndex i) { - return shortlist->tryForwardMap(4545, 3343, i) == data::Shortlist::npos; // TODO beamIdx + return shortlist->tryForwardMap(i) == data::Shortlist::npos; // TODO beamIdx }), suppressed.end()); From fc0f41f24a974ff30829e9fbdbd9b7d324279110 Mon Sep 17 00:00:00 2001 From: Martin Junczys-Dowmunt Date: Mon, 28 Jun 2021 23:15:23 +0000 Subject: [PATCH 087/254] Merged PR 19597: Enable mpi wrapper to use size larger than MAX_INT Enable mpi wrapper to use size larger than MAX_INT. --- CHANGELOG.md | 1 + VERSION | 2 +- src/training/communicator.cpp | 61 ++++++++++++++++++++++++++++++++--- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f4141c84..5cb7c305b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Compute aligned memory sizes using exact sizing ### Fixed +- Added support to MPIWrappest::bcast (and similar) for count of type size_t - Adding new validation metrics when training is restarted and --reset-valid-stalled is used - Missing depth-scaling in transformer FFN - Fixed an issue when loading intgemm16 models from unaligned memory. 
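The communicator.cpp changes below all apply the same chunking pattern described in the commit message: query the element size of the MPI datatype, then walk over the buffer in chunks of at most INT_MAX elements so that each underlying MPI call still receives an `int` count. A minimal standalone sketch of that pattern, shown here for broadcast only (the helper name is illustrative and not part of Marian's actual wrapper API):

```cpp
#include <mpi.h>
#include <algorithm>
#include <cstddef>
#include <limits>

// Broadcast `count` elements even when count exceeds INT_MAX by issuing
// several MPI_Bcast calls, each with an int-sized chunk count.
void bcastLarge(void* buf, size_t count, MPI_Datatype datatype, int rootRank, MPI_Comm comm) {
  int datatypeSize = 0;
  MPI_Type_size(datatype, &datatypeSize);              // element size in bytes, used to advance the buffer pointer
  const size_t limit = (size_t)std::numeric_limits<int>::max();
  size_t remaining = count, offset = 0;
  while(remaining > 0) {
    int chunk = (int)std::min(remaining, limit);       // each MPI call sees an int count
    MPI_Bcast((char*)buf + offset * (size_t)datatypeSize, chunk, datatype, rootRank, comm);
    offset    += (size_t)chunk;
    remaining -= (size_t)chunk;
  }
}
```

In the patch itself this loop is repeated inline in bCast, sSend, recv, and allReduce, each advancing the buffer (and, for allReduce, both the send and receive buffers) by `offset * datatypeSize` bytes per chunk.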
diff --git a/VERSION b/VERSION index c1cadea12..e7f4fc036 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.20 +v1.10.21 diff --git a/src/training/communicator.cpp b/src/training/communicator.cpp index 4b93fa9ec..55d4991bc 100644 --- a/src/training/communicator.cpp +++ b/src/training/communicator.cpp @@ -123,20 +123,73 @@ class MPIWrapper : public IMPIWrapper virtual void barrier(MPI_Comm comm = MPI_COMM_WORLD) const override { HANDLE_MPI_ERROR(MPI_Barrier(comm)); } + virtual void bCast(void* buf, size_t count, MPI_Datatype datatype, size_t rootRank, MPI_Comm comm = MPI_COMM_WORLD) const override { - HANDLE_MPI_ERROR(MPI_Bcast(buf, (int)count, datatype, (int)rootRank, comm)); + // MPI_Bcast only supports MAX_INT count, here and in the functions below, we need to cycle through the counts until we have sent + // all elemements if count is larger MAX_INT. + + // get the data type size in bytes + int datatypeSize; + HANDLE_MPI_ERROR(MPI_Type_size(datatype, &datatypeSize)); + + // get the limit for int count + size_t limit = (size_t)std::numeric_limits::max(); + size_t remaining = count, offset = 0; + + // while there are elements that we have not sent yet, loop until all has been sent in chunks of at most `limit`. + while(remaining > 0) { + int intCount = (int)std::min(remaining, limit); + HANDLE_MPI_ERROR(MPI_Bcast((char*)buf + offset * (size_t)datatypeSize, intCount, datatype, (int)rootRank, comm)); + offset += (size_t)intCount; + remaining -= (size_t)intCount; + } } + virtual void sSend(void* buf, size_t count, MPI_Datatype datatype, size_t destRank, int tag, MPI_Comm comm) const override { - HANDLE_MPI_ERROR(MPI_Ssend(buf, (int)count, datatype, (int)destRank, tag, comm)); + int datatypeSize; + HANDLE_MPI_ERROR(MPI_Type_size(datatype, &datatypeSize)); + + size_t limit = (size_t)std::numeric_limits::max(); + size_t remaining = count, offset = 0; + while(remaining > 0) { + int intCount = (int)std::min(remaining, limit); + HANDLE_MPI_ERROR(MPI_Ssend((char*)buf + offset * (size_t)datatypeSize, intCount, datatype, (int)destRank, tag, comm)); + offset += (size_t)intCount; + remaining -= (size_t)intCount; + } } + virtual void recv(void* buf, size_t count, MPI_Datatype datatype, size_t sourceRank, int tag, MPI_Comm comm, MPI_Status* status) const override { - HANDLE_MPI_ERROR(MPI_Recv(buf, (int)count, datatype, (int)sourceRank, tag, comm, status)); + int datatypeSize; + HANDLE_MPI_ERROR(MPI_Type_size(datatype, &datatypeSize)); + + size_t limit = (size_t)std::numeric_limits::max(); + size_t remaining = count, offset = 0; + while(remaining > 0) { + int intCount = (int)std::min(remaining, limit); + HANDLE_MPI_ERROR(MPI_Recv((char*)buf + offset * (size_t)datatypeSize, intCount, datatype, (int)sourceRank, tag, comm, status)); + offset += (size_t)intCount; + remaining -= (size_t)intCount; + } } + virtual void allReduce(const void* sendbuf, void* recvbuf, size_t count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) const override { if (sendbuf == recvbuf) sendbuf = MPI_IN_PLACE; // MSMPI requires this - HANDLE_MPI_ERROR(MPI_Allreduce(sendbuf, recvbuf, (int)count, datatype, op, comm)); + + int datatypeSize; + HANDLE_MPI_ERROR(MPI_Type_size(datatype, &datatypeSize)); + + size_t limit = (size_t)std::numeric_limits::max(); + size_t remaining = count, offset = 0; + while(remaining > 0) { + int intCount = (int)std::min(remaining, limit); + HANDLE_MPI_ERROR(MPI_Allreduce((char*)sendbuf + offset * (size_t)datatypeSize, (char*)recvbuf + offset * (size_t)datatypeSize, intCount, datatype, op, comm)); + 
offset += (size_t)intCount; + remaining -= (size_t)intCount; + } } + virtual void finalize() override { HANDLE_MPI_ERROR(MPI_Finalize()); } From 8daa0a42559a9bf3e8bed171423b9e1b08c7e000 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 28 Jun 2021 19:26:40 -0700 Subject: [PATCH 088/254] fix compilation errors due to narrow conversion --- src/tensors/cpu/prod.cpp | 34 +++++++++++++++++----------------- src/tensors/gpu/prod.cpp | 22 +++++++++++----------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 313e6fbd9..8fcca924b 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -117,28 +117,28 @@ void ProdBatched(marian::Tensor C, } cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); - size_t m = aShape[-2]; - size_t k = aShape[-1]; + int m = aShape[-2]; + int k = aShape[-1]; if(transA) std::swap(m, k); - size_t l = bShape[-2]; - size_t n = bShape[-1]; + int l = bShape[-2]; + int n = bShape[-1]; if(transB) std::swap(l, n); - size_t lda = aShape[-1]; - size_t ldb = bShape[-1]; - size_t ldc = bShape[-1]; + int lda = aShape[-1]; + int ldb = bShape[-1]; + int ldc = bShape[-1]; if(transB) ldc = bShape[-2]; - auto strideA = m * k; - auto strideB = n * k; - auto strideC = n * m; + int strideA = m * k; + int strideB = n * k; + int strideC = n * m; - auto batchC = cShapeMeta.elements(); + int batchC = cShapeMeta.elements(); // Convert to functional shapes to be able to map dimensions. @TODO merge this functional::Shape aShapeMetaF = aShapeMeta; @@ -218,17 +218,17 @@ void ProdBatched(marian::Tensor C, sgemm(transA, transB, - (int)m, - (int)n, - (int)k, + m, + n, + k, alpha, A->data() + aIndex * strideA, - (int)lda, + lda, B->data() + bIndex * strideB, - (int)ldb, + ldb, beta, C->data() + i * strideC, - (int)ldc); + ldc); } #endif #else diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index e996f58f2..bf0d23957 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -371,19 +371,19 @@ void ProdBatchedTyped(marian::Tensor C, } cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta}); - size_t m = aShape[-2]; - size_t k = aShape[-1]; + int m = aShape[-2]; + int k = aShape[-1]; if(transA) std::swap(m, k); - size_t l = bShape[-2]; - size_t n = bShape[-1]; + int l = bShape[-2]; + int n = bShape[-1]; if(transB) std::swap(l, n); - size_t lda = aShape[-1]; - size_t ldb = bShape[-1]; - size_t ldc = bShape[-1]; + int lda = aShape[-1]; + int ldb = bShape[-1]; + int ldc = bShape[-1]; if(transB) ldc = bShape[-2]; @@ -395,11 +395,11 @@ void ProdBatchedTyped(marian::Tensor C, auto cublasHandle = backend->getCublasHandle(); auto compute = backend->getCudaComputeCapability(); - auto strideA = m * k; - auto strideB = n * k; - auto strideC = n * m; + int strideA = m * k; + int strideB = n * k; + int strideC = n * m; - auto batchC = cShapeMeta.elements(); + int batchC = cShapeMeta.elements(); // Convert to functional shapes to be able to map dimensions. 
@TODO merge this functional::Shape aShapeMetaF = aShapeMeta; From 24c644bae0312a5b640b2cda43020fd4b74cdb07 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 28 Jun 2021 21:26:02 -0700 Subject: [PATCH 089/254] pass shortlist regression tests --- src/data/shortlist.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index b7c034365..ad2525dc9 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -68,13 +68,13 @@ void Shortlist::createCachedTensors(Expr weights, cachedShortWt_ = reshape(cachedShortWt_, {1, 1, cachedShortWt_->shape()[0], cachedShortWt_->shape()[1]}); if (b) { - ABORT("Bias not yet tested"); cachedShortb_ = index_select(b, -1, indicesExpr_); - cachedShortb_ = reshape(cachedShortb_, {1, k, 1, cachedShortb_->shape()[1]}); // not tested } - cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExpr_); - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {1, 1, cachedShortLemmaEt_->shape()[0], k}); + if (lemmaEt) { + cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExpr_); + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {1, 1, cachedShortLemmaEt_->shape()[0], k}); + } } /////////////////////////////////////////////////////////////////////////////////// From ff8af52624682180dc415fbfbab1d9b40fc87eea Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 1 Jul 2021 15:39:18 -0700 Subject: [PATCH 090/254] lock index before creation --- src/data/shortlist.cpp | 3 +++ src/data/shortlist.h | 1 + 2 files changed, 4 insertions(+) diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index ad2525dc9..b9a48b390 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -79,6 +79,7 @@ void Shortlist::createCachedTensors(Expr weights, /////////////////////////////////////////////////////////////////////////////////// Ptr LSHShortlist::index_; +std::mutex LSHShortlist::mutex_; LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize) : Shortlist(std::vector()) @@ -111,6 +112,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, auto values = inputs[1]; int dim = values->shape()[-1]; + mutex_.lock(); if(!index_) { LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_); index_.reset(new faiss::IndexLSH(dim, nbits_, @@ -119,6 +121,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, index_->train(lemmaSize_, values->val()->data()); index_->add( lemmaSize_, values->val()->data()); } + mutex_.unlock(); int qRows = query->shape().elements() / dim; std::vector distances(qRows * k_); diff --git a/src/data/shortlist.h b/src/data/shortlist.h index cd96e0d79..7fc48ec2e 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -74,6 +74,7 @@ class LSHShortlist: public Shortlist { int nbits_; // length of hash size_t lemmaSize_; // vocab size static Ptr index_; // LSH index to store all possible candidates + static std::mutex mutex_; void createCachedTensors(Expr weights, bool isLegacyUntransposedW, From bd1f1ee9cb0cd316a62cbcce6653406979be9a00 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 2 Jul 2021 12:06:03 -0700 Subject: [PATCH 091/254] marcin's review changes --- src/data/shortlist.cpp | 3 ++- src/data/shortlist.h | 2 -- src/layers/logits.cpp | 6 +++--- src/layers/logits.h | 2 +- src/layers/output.cpp | 5 ++--- src/translator/beam_search.cpp | 7 ------- src/translator/translator.h | 2 ++ 7 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/data/shortlist.cpp 
b/src/data/shortlist.cpp index b9a48b390..9f4a4ebd0 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -320,7 +320,8 @@ Ptr createShortlistGenerator(Ptr options, size_t srcIdx, size_t trgIdx, bool shared) { - if (lshOpts.size() == 2) { + if (lshOpts.size()) { + assert(lshOpts.size() == 2); size_t lemmaSize = trgVocab->lemmaSize(); return New(lshOpts[0], lshOpts[1], lemmaSize); } diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 7fc48ec2e..519b6b5f7 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -66,8 +66,6 @@ class ShortlistGenerator { }; /////////////////////////////////////////////////////////////////////////////////// -// implements SLIDE for faster inference. -// https://arxiv.org/pdf/1903.03129.pdf class LSHShortlist: public Shortlist { private: int k_; // number of candidates returned from each input diff --git a/src/layers/logits.cpp b/src/layers/logits.cpp index 0bd8aa911..794323d03 100644 --- a/src/layers/logits.cpp +++ b/src/layers/logits.cpp @@ -62,7 +62,7 @@ Expr Logits::applyLossFunction( auto factorIndices = indices(maskedFactoredLabels.indices); // [B... flattened] factor-label indices, or 0 if factor does not apply auto factorMask = constant(maskedFactoredLabels.masks); // [B... flattened] loss values get multiplied with 0 for labels that don't have this factor auto factorLogits = logits_[g]; // [B... * Ug] label-wise loss values (not aggregated yet) - std::cerr << "g=" << g << " factorLogits->loss()=" << factorLogits->loss()->shape() << std::endl; + //std::cerr << "g=" << g << " factorLogits->loss()=" << factorLogits->loss()->shape() << std::endl; // For each location in [B...] select [indices[B...]]. If not using factor, select [0] and mask it out next. auto factorLoss = lossFn(factorLogits->loss(), factorIndices); // [B... 
x 1] // clang-format on @@ -113,7 +113,7 @@ Expr Logits::getFactoredLogits(size_t groupIndex, else { auto forward = [this, g](Expr out, const std::vector& inputs) { Expr lastIndices = inputs[0]; - std::vector masks = getFactorMasksMultiDim(g, lastIndices); + std::vector masks = getFactorMasks(g, lastIndices); out->val()->set(masks); }; @@ -245,7 +245,7 @@ std::vector Logits::getFactorMasks(size_t factorGroup, const std::vector< return res; } -std::vector Logits::getFactorMasksMultiDim(size_t factorGroup, Expr indicesExpr) +std::vector Logits::getFactorMasks(size_t factorGroup, Expr indicesExpr) const { // [lemmaIndex] -> 1.0 for words that do have this factor; else 0 int batchSize = indicesExpr->shape()[0]; int currBeamSize = indicesExpr->shape()[1]; diff --git a/src/layers/logits.h b/src/layers/logits.h index 1a57657d6..a92a01c30 100644 --- a/src/layers/logits.h +++ b/src/layers/logits.h @@ -77,7 +77,7 @@ class Logits { } // actually the same as constant(data) for this data type std::vector getFactorMasks(size_t factorGroup, const std::vector& indices) const; - std::vector getFactorMasksMultiDim(size_t factorGroup, Expr indicesExpr) const; + std::vector getFactorMasks(size_t factorGroup, Expr indicesExpr) const; // same as above but separate indices for each batch and beam private: // members diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 21eb3714c..d7ba4490a 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -75,7 +75,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { std::cerr << "affineShortlist.b=" << b->shape() << std::endl; std::cerr << "affineShortlist.transA=" << transA << " transB=" << transB << std::endl; */ - ABORT_IF(!(!transA && transB), "Must be transA==0 and transB==1"); + ABORT_IF(!(!transA && transB), "affineShortlist. Must be transA==0 and transB==1"); ABORT_IF(b, "affineShortlist not tested with bias"); Expr ret = bdot(x, W, transA, transB); //std::cerr << "ret=" << ret->shape() << std::endl; @@ -83,8 +83,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return ret; }; - if(shortlist_) { // shortlisted versions of parameters are cached within one - // batch, then clear()ed + if(shortlist_) { shortlist_->filter(input, Wt_, isLegacyUntransposedW, b_, lemmaEt_); } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 94de3db06..da529980a 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -20,7 +20,6 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current const std::vector& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use. 
const std::vector& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx] std::vector align; // collects alignment information from the last executed time step - //utils::Debug(batchIdxMap, "batchIdxMap"); if(options_->hasAndNotEmpty("alignment") && factorGroup == 0) align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble, @@ -86,12 +85,6 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // map wordIdx to word auto prevBeamHypIdx = beamHypIdx; // back pointer - /*std::cerr << "currentBatchIdx=" << currentBatchIdx - << " origBatchIdx=" << origBatchIdx - << " beamHypIdx=" << beamHypIdx - << " prevBeamHypIdx=" << prevBeamHypIdx - << std::endl;*/ - auto prevHyp = beam[prevBeamHypIdx]; Word word; // If short list has been set, then wordIdx is an index into the short-listed word set, diff --git a/src/translator/translator.h b/src/translator/translator.h index f4f9ec4cd..f1acd5a17 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -63,6 +63,8 @@ class Translate : public ModelTask { auto srcVocab = corpus_->getVocabs()[0]; std::vector lshOpts = options_->get>("output-approx-knn"); + ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); + if (lshOpts.size() == 2 || options_->hasAndNotEmpty("shortlist")) { shortlistGenerator_ = data::createShortlistGenerator(options_, srcVocab, trgVocab_, lshOpts, 0, 1, vocabs.front() == vocabs.back()); } From 9acf27d6bc41ab2de9d64529896a23331a1f5292 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 2 Jul 2021 12:16:57 -0700 Subject: [PATCH 092/254] credit SLIDE --- src/data/shortlist.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 519b6b5f7..05c734133 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -66,6 +66,8 @@ class ShortlistGenerator { }; /////////////////////////////////////////////////////////////////////////////////// +// faster inference inspired by SLIDE. +// https://arxiv.org/pdf/1903.03129.pdf class LSHShortlist: public Shortlist { private: int k_; // number of candidates returned from each input From 5ad0edf6df45b9e618097daf2b3dbb3b98b36e52 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 2 Jul 2021 19:07:42 -0700 Subject: [PATCH 093/254] remove todo --- src/translator/beam_search.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index da529980a..2a0d3947a 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -308,7 +308,7 @@ Histories BeamSearch::search(Ptr graph, Ptr suppressed.erase(std::remove_if(suppressed.begin(), suppressed.end(), [&](WordIndex i) { - return shortlist->tryForwardMap(i) == data::Shortlist::npos; // TODO beamIdx + return shortlist->tryForwardMap(i) == data::Shortlist::npos; }), suppressed.end()); From 4ace42f35aa9fe6d51884ccd5486ba3e8cf626b4 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 2 Jul 2021 20:30:50 -0700 Subject: [PATCH 094/254] paper --- src/data/shortlist.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 05c734133..1ce8fbf40 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -66,8 +66,8 @@ class ShortlistGenerator { }; /////////////////////////////////////////////////////////////////////////////////// -// faster inference inspired by SLIDE. 
-// https://arxiv.org/pdf/1903.03129.pdf +// faster inference inspired by these 2 papers +// https://arxiv.org/pdf/1903.03129.pdf https://arxiv.org/pdf/1806.00588.pdf class LSHShortlist: public Shortlist { private: int k_; // number of candidates returned from each input From 9772aa293f574aef5fb1a2756ae28ef7428b3dde Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 3 Jul 2021 12:13:26 -0700 Subject: [PATCH 095/254] remaining comments --- src/data/factored_vocab.cpp | 2 +- src/data/shortlist.cpp | 8 ++++---- src/data/shortlist.h | 18 +++++++++--------- src/graph/expression_operators.h | 4 ++++ 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index 4c5207dd5..cc7159938 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -275,7 +275,7 @@ void FactoredVocab::constructGroupInfoFromFactorVocab() { groupCounts[g]++; } - // required by LSH shortlist + // required by LSH shortlist. Factored segmenter encodes the number of lemmas in the first factor group, this corresponds to actual surface forms lemmaSize_ = groupCounts[0]; for (size_t g = 0; g < numGroups; g++) { // detect non-overlapping groups diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 9f4a4ebd0..f7e229ffe 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -19,8 +19,8 @@ const T* get(const void*& current, size_t num = 1) { ////////////////////////////////////////////////////////////////////////////////////// Shortlist::Shortlist(const std::vector& indices) - : indices_(indices) - , done_(false) {} + : indices_(indices), + initialized_(false) {} Shortlist::~Shortlist() {} @@ -35,7 +35,7 @@ WordIndex Shortlist::tryForwardMap(WordIndex wIdx) const { } void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - if (done_) { + if (initialized_) { return; } @@ -49,7 +49,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k); - done_ = true; + initialized_ = true; } Expr Shortlist::getIndicesExpr() const { diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 1ce8fbf40..a75d2c4ba 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -29,13 +29,13 @@ class Shortlist { Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) Expr cachedShortb_; // these match the current value of shortlist_ Expr cachedShortLemmaEt_; - bool done_; // used by batch-level shortlist. Only initialize with 1st call then skip all subsequent calls for same batch + bool initialized_; // used by batch-level shortlist. 
Only initialize with 1st call then skip all subsequent calls for same batch void createCachedTensors(Expr weights, - bool isLegacyUntransposedW, - Expr b, - Expr lemmaEt, - int k); + bool isLegacyUntransposedW, + Expr b, + Expr lemmaEt, + int k); public: static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos @@ -77,10 +77,10 @@ class LSHShortlist: public Shortlist { static std::mutex mutex_; void createCachedTensors(Expr weights, - bool isLegacyUntransposedW, - Expr b, - Expr lemmaEt, - int k); + bool isLegacyUntransposedW, + Expr b, + Expr lemmaEt, + int k); public: LSHShortlist(int k, int nbits, size_t lemmaSize); diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index c1570effe..6c7e5758d 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -478,6 +478,10 @@ Expr bdot(Expr a, bool transB = false, float scalar = 1.f); +/** + * bdot_legacy is an old implemetation of bdot without correct broadcasting on the batch dimensions, + * to be removed once the behavior can be correctly replicated with normal bdot on 5 dimensions. + */ Expr bdot_legacy(Expr a, Expr b, bool transA = false, From 35c822eb4ea29e5445b7c75b665c4872a2cc1adb Mon Sep 17 00:00:00 2001 From: Martin Junczys-Dowmunt Date: Fri, 9 Jul 2021 20:35:09 +0000 Subject: [PATCH 096/254] Merged PR 19685: Marianize LSH as operators for mmapping and use in Quicksand This PR turns the LSH index and search into a set of operators that live in the expression graph. This makes creation etc. thread-safe (one index per graph) and allows to later implement GPU versions. This allows to mmap the LSH as a Marian parameter since now we only need to turn the index into something that can be saved to disk using the existing tensors. This happens in marian_conv or the equivalent interface function in the Quicksand interface. --- src/3rd_party/faiss/Index.cpp | 119 ---------- src/3rd_party/faiss/Index.h | 177 --------------- src/3rd_party/faiss/IndexLSH.cpp | 224 ------------------- src/3rd_party/faiss/IndexLSH.h | 90 -------- src/3rd_party/faiss/utils/hamming-inl.h | 10 +- src/3rd_party/faiss/utils/hamming.h | 4 +- src/CMakeLists.txt | 1 + src/command/marian_conv.cpp | 39 +++- src/data/shortlist.cpp | 74 +------ src/data/shortlist.h | 3 +- src/graph/expression_graph.h | 10 + src/graph/expression_operators.cpp | 12 +- src/graph/expression_operators.h | 11 +- src/graph/node_initializers.cpp | 30 +-- src/graph/node_initializers.h | 12 +- src/graph/node_operators_binary.h | 22 +- src/graph/node_operators_unary.h | 41 +++- src/layers/lsh.cpp | 233 ++++++++++++++++++++ src/layers/lsh.h | 49 ++++ src/microsoft/quicksand.cpp | 24 +- src/microsoft/quicksand.h | 5 +- src/tensors/cpu/expression_graph_packable.h | 36 ++- src/tensors/tensor.h | 7 +- src/training/training_state.h | 3 +- 24 files changed, 499 insertions(+), 737 deletions(-) delete mode 100644 src/3rd_party/faiss/Index.cpp delete mode 100644 src/3rd_party/faiss/IndexLSH.cpp delete mode 100644 src/3rd_party/faiss/IndexLSH.h create mode 100644 src/layers/lsh.cpp create mode 100644 src/layers/lsh.h diff --git a/src/3rd_party/faiss/Index.cpp b/src/3rd_party/faiss/Index.cpp deleted file mode 100644 index eac5f3d93..000000000 --- a/src/3rd_party/faiss/Index.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -// -*- c++ -*- - -#include "Index.h" -#include "common/logging.h" -#include - -namespace faiss { - -Index::~Index () -{ -} - - -void Index::train(idx_t /*n*/, const float* /*x*/) { - // does nothing by default -} - - -void Index::range_search (idx_t , const float *, float, - RangeSearchResult *) const -{ - ABORT ("range search not implemented"); -} - -void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k) -{ - float * distances = new float[n * k]; - ScopeDeleter del(distances); - search (n, x, k, distances, labels); -} - -void Index::add_with_ids( - idx_t /*n*/, - const float* /*x*/, - const idx_t* /*xids*/) { - ABORT ("add_with_ids not implemented for this type of index"); -} - -size_t Index::remove_ids(const IDSelector& /*sel*/) { - ABORT ("remove_ids not implemented for this type of index"); - return -1; -} - - -void Index::reconstruct (idx_t, float * ) const { - ABORT ("reconstruct not implemented for this type of index"); -} - - -void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const { - for (idx_t i = 0; i < ni; i++) { - reconstruct (i0 + i, recons + i * d); - } -} - - -void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k, - float *distances, idx_t *labels, - float *recons) const { - search (n, x, k, distances, labels); - for (idx_t i = 0; i < n; ++i) { - for (idx_t j = 0; j < k; ++j) { - idx_t ij = i * k + j; - idx_t key = labels[ij]; - float* reconstructed = recons + ij * d; - if (key < 0) { - // Fill with NaNs - memset(reconstructed, -1, sizeof(*reconstructed) * d); - } else { - reconstruct (key, reconstructed); - } - } - } -} - -void Index::compute_residual (const float * x, - float * residual, idx_t key) const { - reconstruct (key, residual); - for (size_t i = 0; i < d; i++) { - residual[i] = x[i] - residual[i]; - } -} - -void Index::compute_residual_n (idx_t n, const float* xs, - float* residuals, - const idx_t* keys) const { -//#pragma omp parallel for - for (idx_t i = 0; i < n; ++i) { - compute_residual(&xs[i * d], &residuals[i * d], keys[i]); - } -} - - - -size_t Index::sa_code_size () const -{ - ABORT ("standalone codec not implemented for this type of index"); -} - -void Index::sa_encode (idx_t, const float *, - uint8_t *) const -{ - ABORT ("standalone codec not implemented for this type of index"); -} - -void Index::sa_decode (idx_t, const uint8_t *, - float *) const -{ - ABORT ("standalone codec not implemented for this type of index"); -} - -} diff --git a/src/3rd_party/faiss/Index.h b/src/3rd_party/faiss/Index.h index deaabcaad..24765f7d9 100644 --- a/src/3rd_party/faiss/Index.h +++ b/src/3rd_party/faiss/Index.h @@ -39,11 +39,6 @@ namespace faiss { -/// Forward declarations see AuxIndexStructures.h -struct IDSelector; -struct RangeSearchResult; -struct DistanceComputer; - /** Abstract structure for an index, supports adding vectors and searching them. 
* * All vectors provided at add or search time are 32-bit float arrays, @@ -53,178 +48,6 @@ struct Index { using idx_t = int64_t; ///< all indices are this type using component_t = float; using distance_t = float; - - int d; ///< vector dimension - idx_t ntotal; ///< total nb of indexed vectors - bool verbose; ///< verbosity level - - /// set if the Index does not require training, or if training is - /// done already - bool is_trained; - - /// type of metric this index uses for search - MetricType metric_type; - float metric_arg; ///< argument of the metric type - - explicit Index (idx_t d = 0, MetricType metric = METRIC_L2): - d((int)d), - ntotal(0), - verbose(false), - is_trained(true), - metric_type (metric), - metric_arg(0) {} - - virtual ~Index (); - - - /** Perform training on a representative set of vectors - * - * @param n nb of training vectors - * @param x training vecors, size n * d - */ - virtual void train(idx_t n, const float* x); - - /** Add n vectors of dimension d to the index. - * - * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1 - * This function slices the input vectors in chuncks smaller than - * blocksize_add and calls add_core. - * @param x input matrix, size n * d - */ - virtual void add (idx_t n, const float *x) = 0; - - /** Same as add, but stores xids instead of sequential ids. - * - * The default implementation fails with an assertion, as it is - * not supported by all indexes. - * - * @param xids if non-null, ids to store for the vectors (size n) - */ - virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids); - - /** query n vectors of dimension d to the index. - * - * return at most k vectors. If there are not enough results for a - * query, the result array is padded with -1s. - * - * @param x input vectors to search, size n * d - * @param labels output labels of the NNs, size n*k - * @param distances output pairwise distances, size n*k - */ - virtual void search (idx_t n, const float *x, idx_t k, - float *distances, idx_t *labels) const = 0; - - /** query n vectors of dimension d to the index. - * - * return all vectors with distance < radius. Note that many - * indexes do not implement the range_search (only the k-NN search - * is mandatory). - * - * @param x input vectors to search, size n * d - * @param radius search radius - * @param result result table - */ - virtual void range_search (idx_t n, const float *x, float radius, - RangeSearchResult *result) const; - - /** return the indexes of the k vectors closest to the query x. - * - * This function is identical as search but only return labels of neighbors. - * @param x input vectors to search, size n * d - * @param labels output labels of the NNs, size n*k - */ - void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1); - - /// removes all elements from the database. - virtual void reset() = 0; - - /** removes IDs from the index. Not supported by all - * indexes. Returns the number of elements removed. 
- */ - virtual size_t remove_ids (const IDSelector & sel); - - /** Reconstruct a stored vector (or an approximation if lossy coding) - * - * this function may not be defined for some indexes - * @param key id of the vector to reconstruct - * @param recons reconstucted vector (size d) - */ - virtual void reconstruct (idx_t key, float * recons) const; - - /** Reconstruct vectors i0 to i0 + ni - 1 - * - * this function may not be defined for some indexes - * @param recons reconstucted vector (size ni * d) - */ - virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const; - - /** Similar to search, but also reconstructs the stored vectors (or an - * approximation in the case of lossy coding) for the search results. - * - * If there are not enough results for a query, the resulting arrays - * is padded with -1s. - * - * @param recons reconstructed vectors size (n, k, d) - **/ - virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k, - float *distances, idx_t *labels, - float *recons) const; - - /** Computes a residual vector after indexing encoding. - * - * The residual vector is the difference between a vector and the - * reconstruction that can be decoded from its representation in - * the index. The residual can be used for multiple-stage indexing - * methods, like IndexIVF's methods. - * - * @param x input vector, size d - * @param residual output residual vector, size d - * @param key encoded index, as returned by search and assign - */ - virtual void compute_residual (const float * x, - float * residual, idx_t key) const; - - /** Computes a residual vector after indexing encoding (batch form). - * Equivalent to calling compute_residual for each vector. - * - * The residual vector is the difference between a vector and the - * reconstruction that can be decoded from its representation in - * the index. The residual can be used for multiple-stage indexing - * methods, like IndexIVF's methods. - * - * @param n number of vectors - * @param xs input vectors, size (n x d) - * @param residuals output residual vectors, size (n x d) - * @param keys encoded index, as returned by search and assign - */ - virtual void compute_residual_n (idx_t n, const float* xs, - float* residuals, - const idx_t* keys) const; - - /* The standalone codec interface */ - - /** size of the produced codes in bytes */ - virtual size_t sa_code_size () const; - - /** encode a set of vectors - * - * @param n number of vectors - * @param x input vectors, size n * d - * @param bytes output encoded vectors, size n * sa_code_size() - */ - virtual void sa_encode (idx_t n, const float *x, - uint8_t *bytes) const; - - /** encode a set of vectors - * - * @param n number of vectors - * @param bytes input encoded vectors, size n * sa_code_size() - * @param x output vectors, size n * d - */ - virtual void sa_decode (idx_t n, const uint8_t *bytes, - float *x) const; - - }; } diff --git a/src/3rd_party/faiss/IndexLSH.cpp b/src/3rd_party/faiss/IndexLSH.cpp deleted file mode 100644 index 6df843312..000000000 --- a/src/3rd_party/faiss/IndexLSH.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// -*- c++ -*- - -#include - -#include -#include - -#include - -#include -#include "common/logging.h" - - -namespace faiss { - -/*************************************************************** - * IndexLSH - ***************************************************************/ - - -IndexLSH::IndexLSH (idx_t d, int nbits, bool rotate_data, bool train_thresholds): - Index(d), nbits(nbits), rotate_data(rotate_data), - train_thresholds (train_thresholds), rrot(d, nbits) -{ - is_trained = !train_thresholds; - - bytes_per_vec = (nbits + 7) / 8; - - if (rotate_data) { - rrot.init(5); - } else { - ABORT_UNLESS(d >= nbits, "d >= nbits"); - } -} - -IndexLSH::IndexLSH (): - nbits (0), bytes_per_vec(0), rotate_data (false), train_thresholds (false) -{ -} - - -const float * IndexLSH::apply_preprocess (idx_t n, const float *x) const -{ - - float *xt = nullptr; - if (rotate_data) { - // also applies bias if exists - xt = rrot.apply (n, x); - } else if (d != nbits) { - assert (nbits < d); - xt = new float [nbits * n]; - float *xp = xt; - for (idx_t i = 0; i < n; i++) { - const float *xl = x + i * d; - for (int j = 0; j < nbits; j++) - *xp++ = xl [j]; - } - } - - if (train_thresholds) { - - if (xt == NULL) { - xt = new float [nbits * n]; - memcpy (xt, x, sizeof(*x) * n * nbits); - } - - float *xp = xt; - for (idx_t i = 0; i < n; i++) - for (int j = 0; j < nbits; j++) - *xp++ -= thresholds [j]; - } - - return xt ? xt : x; -} - - - -void IndexLSH::train (idx_t n, const float *x) -{ - if (train_thresholds) { - thresholds.resize (nbits); - train_thresholds = false; - const float *xt = apply_preprocess (n, x); - ScopeDeleter del (xt == x ? nullptr : xt); - train_thresholds = true; - - float * transposed_x = new float [n * nbits]; - ScopeDeleter del2 (transposed_x); - - for (idx_t i = 0; i < n; i++) - for (idx_t j = 0; j < nbits; j++) - transposed_x [j * n + i] = xt [i * nbits + j]; - - for (idx_t i = 0; i < nbits; i++) { - float *xi = transposed_x + i * n; - // std::nth_element - std::sort (xi, xi + n); - if (n % 2 == 1) - thresholds [i] = xi [n / 2]; - else - thresholds [i] = (xi [n / 2 - 1] + xi [n / 2]) / 2; - - } - } - is_trained = true; -} - - -void IndexLSH::add (idx_t n, const float *x) -{ - ABORT_UNLESS (is_trained, "is_trained"); - codes.resize ((ntotal + n) * bytes_per_vec); - - sa_encode (n, x, &codes[ntotal * bytes_per_vec]); - - ntotal += n; -} - - -void IndexLSH::search ( - idx_t n, - const float *x, - idx_t k, - float *distances, - idx_t *labels) const -{ - ABORT_UNLESS (is_trained, "is_trained"); - const float *xt = apply_preprocess (n, x); - ScopeDeleter del (xt == x ? 
nullptr : xt); - - uint8_t * qcodes = new uint8_t [n * bytes_per_vec]; - ScopeDeleter del2 (qcodes); - - fvecs2bitvecs (xt, qcodes, nbits, n); - - int * idistances = new int [n * k]; - ScopeDeleter del3 (idistances); - - int_maxheap_array_t res = { size_t(n), size_t(k), labels, idistances}; - - hammings_knn_hc (&res, qcodes, codes.data(), - ntotal, bytes_per_vec, true); - - - // convert distances to floats - for (int i = 0; i < k * n; i++) - distances[i] = idistances[i]; - -} - - -void IndexLSH::transfer_thresholds (LinearTransform *vt) { - if (!train_thresholds) return; - ABORT_UNLESS (nbits == vt->d_out, "nbits == vt->d_out"); - if (!vt->have_bias) { - vt->b.resize (nbits, 0); - vt->have_bias = true; - } - for (int i = 0; i < nbits; i++) - vt->b[i] -= thresholds[i]; - train_thresholds = false; - thresholds.clear(); -} - -void IndexLSH::reset() { - codes.clear(); - ntotal = 0; -} - - -size_t IndexLSH::sa_code_size () const -{ - return bytes_per_vec; -} - -void IndexLSH::sa_encode (idx_t n, const float *x, - uint8_t *bytes) const -{ - ABORT_UNLESS (is_trained, "is_trained"); - const float *xt = apply_preprocess (n, x); - ScopeDeleter del (xt == x ? nullptr : xt); - fvecs2bitvecs (xt, bytes, nbits, n); -} - -void IndexLSH::sa_decode (idx_t n, const uint8_t *bytes, - float *x) const -{ - float *xt = x; - ScopeDeleter del; - if (rotate_data || nbits != d) { - xt = new float [n * nbits]; - del.set(xt); - } - bitvecs2fvecs (bytes, xt, nbits, n); - - if (train_thresholds) { - float *xp = xt; - for (idx_t i = 0; i < n; i++) { - for (int j = 0; j < nbits; j++) { - *xp++ += thresholds [j]; - } - } - } - - if (rotate_data) { - rrot.reverse_transform (n, xt, x); - } else if (nbits != d) { - for (idx_t i = 0; i < n; i++) { - memcpy (x + i * d, xt + i * nbits, - nbits * sizeof(xt[0])); - } - } -} - - - -} // namespace faiss diff --git a/src/3rd_party/faiss/IndexLSH.h b/src/3rd_party/faiss/IndexLSH.h deleted file mode 100644 index 66435363a..000000000 --- a/src/3rd_party/faiss/IndexLSH.h +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -// -*- c++ -*- - -#ifndef INDEX_LSH_H -#define INDEX_LSH_H - -#include - -#include -#include - -namespace faiss { - - -/** The sign of each vector component is put in a binary signature */ -struct IndexLSH:Index { - typedef unsigned char uint8_t; - - int nbits; ///< nb of bits per vector - int bytes_per_vec; ///< nb of 8-bits per encoded vector - bool rotate_data; ///< whether to apply a random rotation to input - bool train_thresholds; ///< whether we train thresholds or use 0 - - RandomRotationMatrix rrot; ///< optional random rotation - - std::vector thresholds; ///< thresholds to compare with - - /// encoded dataset - std::vector codes; - - IndexLSH ( - idx_t d, int nbits, - bool rotate_data = true, - bool train_thresholds = false); - - /** Preprocesses and resizes the input to the size required to - * binarize the data - * - * @param x input vectors, size n * d - * @return output vectors, size n * bits. 
May be the same pointer - * as x, otherwise it should be deleted by the caller - */ - const float *apply_preprocess (idx_t n, const float *x) const; - - void train(idx_t n, const float* x) override; - - void add(idx_t n, const float* x) override; - - void search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels) const override; - - void reset() override; - - /// transfer the thresholds to a pre-processing stage (and unset - /// train_thresholds) - void transfer_thresholds (LinearTransform * vt); - - ~IndexLSH() override {} - - IndexLSH (); - - /* standalone codec interface. - * - * The vectors are decoded to +/- 1 (not 0, 1) */ - - size_t sa_code_size () const override; - - void sa_encode (idx_t n, const float *x, - uint8_t *bytes) const override; - - void sa_decode (idx_t n, const uint8_t *bytes, - float *x) const override; - -}; - - -} - - -#endif diff --git a/src/3rd_party/faiss/utils/hamming-inl.h b/src/3rd_party/faiss/utils/hamming-inl.h index d32da7580..b164dc88e 100644 --- a/src/3rd_party/faiss/utils/hamming-inl.h +++ b/src/3rd_party/faiss/utils/hamming-inl.h @@ -10,8 +10,8 @@ namespace faiss { -#ifdef _MSC_VER -#define bzero(p,n) (memset((p),0,(n))) +#ifdef _MSC_VER +#define bzero(p,n) (memset((p),0,(n))) #endif inline BitstringWriter::BitstringWriter(uint8_t *code, int code_size): code (code), code_size (code_size), i(0) @@ -29,7 +29,7 @@ inline void BitstringWriter::write(uint64_t x, int nbit) { i += nbit; return; } else { - int j = i >> 3; + size_t j = i >> 3; code[j++] |= x << (i & 7); i += nbit; x >>= na; @@ -57,7 +57,7 @@ inline uint64_t BitstringReader::read(int nbit) { return res; } else { int ofs = na; - int j = (i >> 3) + 1; + size_t j = (i >> 3) + 1; i += nbit; nbit -= na; while (nbit > 8) { @@ -160,7 +160,7 @@ struct HammingComputer20 { void set (const uint8_t *a8, int code_size) { assert (code_size == 20); const uint64_t *a = (uint64_t *)a8; - a0 = a[0]; a1 = a[1]; a2 = a[2]; + a0 = a[0]; a1 = a[1]; a2 = (uint32_t)a[2]; } inline int hamming (const uint8_t *b8) const { diff --git a/src/3rd_party/faiss/utils/hamming.h b/src/3rd_party/faiss/utils/hamming.h index 762d3773c..0c89c4d1f 100644 --- a/src/3rd_party/faiss/utils/hamming.h +++ b/src/3rd_party/faiss/utils/hamming.h @@ -31,7 +31,7 @@ #ifdef _MSC_VER #include // needed for some intrinsics in -#define __builtin_popcountl __popcnt64 +#define __builtin_popcountl __popcnt64 #endif /* The Hamming distance type */ @@ -116,7 +116,7 @@ struct BitstringReader { extern size_t hamming_batch_size; static inline int popcount64(uint64_t x) { - return __builtin_popcountl(x); + return (int)__builtin_popcountl(x); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d2fd269f8..1f5db423f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -75,6 +75,7 @@ set(MARIAN_SOURCES layers/embedding.cpp layers/output.cpp layers/logits.cpp + layers/lsh.cpp rnn/cells.cpp rnn/attention.cpp diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index 26cac858f..e0e89d2bc 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -2,6 +2,7 @@ #include "common/cli_wrapper.h" #include "tensors/cpu/expression_graph_packable.h" #include "onnx/expression_graph_onnx_exporter.h" +#include "layers/lsh.h" #include @@ -25,6 +26,9 @@ int main(int argc, char** argv) { cli->add("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512, " "intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512", 
"float32"); + cli->add>("--add-lsh", + "Encode output matrix and optional rotation matrix into model file. " + "arg1: number of bits in LSH encoding, arg2: name of output weights matrix")->implicit_val("1024 Wemb"); cli->add>("--vocabs,-V", "Vocabulary file, required for ONNX export"); cli->parse(argc, argv); options->merge(config); @@ -34,6 +38,16 @@ int main(int argc, char** argv) { auto exportAs = options->get("export-as"); auto vocabPaths = options->get>("vocabs");// , std::vector()); + + bool addLsh = options->hasAndNotEmpty("add-lsh"); + int lshNBits = 1024; + std::string lshOutputWeights = "Wemb"; + if(addLsh) { + auto lshParams = options->get>("add-lsh"); + lshNBits = std::stoi(lshParams[0]); + if(lshParams.size() > 1) + lshOutputWeights = lshParams[1]; + } // We accept any type here and will later croak during packAndSave if the type cannot be used for conversion Type saveGemmType = typeFromString(options->get("gemm-type", "float32")); @@ -45,23 +59,36 @@ int main(int argc, char** argv) { marian::io::getYamlFromModel(config, "special:model.yml", modelFrom); configStr << config; - auto load = [&](Ptr graph) { + if (exportAs == "marian-bin") { + auto graph = New(); graph->setDevice(CPU0); graph->load(modelFrom); + + if(addLsh) { + // Add dummy parameters for the LSH before the model gets actually initialized. + // This create the parameters with useless values in the tensors, but it gives us the memory we need. + graph->setReloaded(false); + lsh::addDummyParameters(graph, /*weights=*/lshOutputWeights, /*nBits=*/lshNBits); + graph->setReloaded(true); + } + graph->forward(); // run the initializers - }; + if(addLsh) { + // After initialization, hijack the paramters for the LSH and force-overwrite with correct values. + // Once this is done we can just pack and save as normal. 
+ lsh::overwriteDummyParameters(graph, /*weights=*/lshOutputWeights); + } - if (exportAs == "marian-bin") { - auto graph = New(); - load(graph); // added a flag if the weights needs to be packed or not graph->packAndSave(modelTo, configStr.str(), /* --gemm-type */ saveGemmType, Type::float32); } else if (exportAs == "onnx-encode") { #ifdef USE_ONNX auto graph = New(); - load(graph); + graph->setDevice(CPU0); + graph->load(modelFrom); + graph->forward(); // run the initializers auto modelOptions = New(config)->with("vocabs", vocabPaths, "inference", true); graph->exportToONNX(modelTo, modelOptions, vocabPaths); diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index f7e229ffe..396c6ba49 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -1,10 +1,7 @@ #include "data/shortlist.h" #include "microsoft/shortlist/utils/ParameterTree.h" #include "marian.h" - -#if BLAS_FOUND -#include "3rd_party/faiss/IndexLSH.h" -#endif +#include "layers/lsh.h" namespace marian { namespace data { @@ -47,7 +44,6 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp Shape kShape({k}); indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); - //std::cerr << "indicesExpr_=" << indicesExpr_->shape() << std::endl; createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k); initialized_ = true; } @@ -78,12 +74,10 @@ void Shortlist::createCachedTensors(Expr weights, } /////////////////////////////////////////////////////////////////////////////////// -Ptr LSHShortlist::index_; -std::mutex LSHShortlist::mutex_; LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize) -: Shortlist(std::vector()) -, k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { +: Shortlist(std::vector()), + k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { } WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { @@ -99,67 +93,23 @@ Expr LSHShortlist::getIndicesExpr() const { } void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { -#if BLAS_FOUND + ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, "LSH index (--output-approx-knn) currently not implemented for GPU"); - int currBeamSize = input->shape()[0]; - int batchSize = input->shape()[2]; - int numHypos = currBeamSize * batchSize; - - auto forward = [this, numHypos](Expr out, const std::vector& inputs) { - auto query = inputs[0]; - auto values = inputs[1]; - int dim = values->shape()[-1]; - - mutex_.lock(); - if(!index_) { - LOG(info, "Building LSH index for vector dim {} and with hash size {} bits", dim, nbits_); - index_.reset(new faiss::IndexLSH(dim, nbits_, - /*rotate=*/dim != nbits_, - /*train_thesholds*/false)); - index_->train(lemmaSize_, values->val()->data()); - index_->add( lemmaSize_, values->val()->data()); - } - mutex_.unlock(); - - int qRows = query->shape().elements() / dim; - std::vector distances(qRows * k_); - std::vector ids(qRows * k_); - - index_->search(qRows, query->val()->data(), k_, - distances.data(), ids.data()); - - indices_.clear(); - for(auto iter = ids.begin(); iter != ids.end(); ++iter) { - faiss::Index::idx_t id = *iter; - indices_.push_back((WordIndex)id); - } - - for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { - size_t startIdx = k_ * hypoIdx; - size_t endIdx = startIdx + k_; - std::sort(indices_.begin() + startIdx, indices_.begin() + endIdx); - } - out->val()->set(indices_); - }; - - Shape kShape({currBeamSize, batchSize, k_}); - indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, 
forward); + indicesExpr_ = callback(lsh::search(input, weights, k_, nbits_, (int)lemmaSize_), + [this](Expr node) { + node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node + }); createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k_); - -#else - input; weights; isLegacyUntransposedW; b; lemmaEt; - ABORT("LSH output layer requires a CPU BLAS library"); -#endif } void LSHShortlist::createCachedTensors(Expr weights, - bool isLegacyUntransposedW, - Expr b, - Expr lemmaEt, - int k) { + bool isLegacyUntransposedW, + Expr b, + Expr lemmaEt, + int k) { int currBeamSize = indicesExpr_->shape()[0]; int batchSize = indicesExpr_->shape()[1]; ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); diff --git a/src/data/shortlist.h b/src/data/shortlist.h index a75d2c4ba..d3841b21a 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -25,7 +25,8 @@ namespace data { class Shortlist { protected: std::vector indices_; // // [packed shortlist index] -> word index, used to select columns from output embeddings - Expr indicesExpr_; + Expr indicesExpr_; // cache an expression that contains the short list indices + Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) Expr cachedShortb_; // these match the current value of shortlist_ Expr cachedShortLemmaEt_; diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index fce7d532f..553a5d63b 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -646,6 +646,16 @@ class ExpressionGraph : public std::enable_shared_from_this { return it->second; } + /** + * Return the Parameters object related to the graph by elementType. + * The Parameters object holds the whole set of the parameter nodes of the given type. + */ + Ptr& params(Type elementType) { + auto it = paramsByElementType_.find(elementType); + ABORT_IF(it == paramsByElementType_.end(), "Parameter object for type {} does not exist", defaultElementType_); + return it->second; + } + /** * Set default element type for the graph. * The default value is used if some node type is not specified. diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 24d12eea7..560ab4e73 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -28,13 +28,17 @@ Expr checkpoint(Expr a) { } Expr lambda(const std::vector& nodes, Shape shape, Type type, - LambdaNodeFunctor fwd) { - return Expression(nodes, shape, type, fwd); + LambdaNodeFunctor fwd, size_t hash) { + return Expression(nodes, shape, type, fwd, hash); } Expr lambda(const std::vector& nodes, Shape shape, Type type, - LambdaNodeFunctor fwd, LambdaNodeFunctor bwd) { - return Expression(nodes, shape, type, fwd, bwd); + LambdaNodeFunctor fwd, LambdaNodeFunctor bwd, size_t hash) { + return Expression(nodes, shape, type, fwd, bwd, hash); +} + +Expr callback(Expr node, LambdaNodeCallback call) { + return Expression(node, call); } // logistic function. Note: scipy name is expit() diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 6c7e5758d..e34ddc8ac 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -26,12 +26,19 @@ typedef std::function& in)> LambdaNodeFun /** * Arbitrary node with forward operation only. 
*/ -Expr lambda(const std::vector& nodes, Shape shape, Type type, LambdaNodeFunctor fwd); +Expr lambda(const std::vector& nodes, Shape shape, Type type, LambdaNodeFunctor fwd, size_t hash = 0); /** * Arbitrary node with forward and backward operation. */ -Expr lambda(const std::vector& nodes, Shape shape, Type type, LambdaNodeFunctor fwd, LambdaNodeFunctor bwd); +Expr lambda(const std::vector& nodes, Shape shape, Type type, LambdaNodeFunctor fwd, LambdaNodeFunctor bwd, size_t hash = 0); + + +/** + * Convience typedef for graph @ref lambda expressions. + */ +typedef std::function LambdaNodeCallback; +Expr callback(Expr node, LambdaNodeCallback call); /** * @addtogroup graph_ops_activation Activation Functions diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp index 4e39d1bf3..e44b48287 100644 --- a/src/graph/node_initializers.cpp +++ b/src/graph/node_initializers.cpp @@ -11,6 +11,15 @@ namespace marian { namespace inits { +class DummyInit : public NodeInitializer { +public: + void apply(Tensor tensor) override { + tensor; + } +}; + +Ptr dummy() { return New(); } + class LambdaInit : public NodeInitializer { private: std::function lambda_; @@ -237,24 +246,3 @@ template Ptr range(IndexType begin, IndexType end, I } // namespace inits } // namespace marian - -#if BLAS_FOUND -#include "faiss/VectorTransform.h" - -namespace marian { -namespace inits { - -Ptr randomRotation(size_t seed) { - auto rot = [=](Tensor t) { - int rows = t->shape()[-2]; - int cols = t->shape()[-1]; - faiss::RandomRotationMatrix rrot(cols, rows); // transposed in faiss - rrot.init((int)seed); - t->set(rrot.A); - }; - return fromLambda(rot, Type::float32); -} - -} // namespace inits -} // namespace marian -#endif diff --git a/src/graph/node_initializers.h b/src/graph/node_initializers.h index 7cdb41831..5e9f80137 100644 --- a/src/graph/node_initializers.h +++ b/src/graph/node_initializers.h @@ -35,6 +35,11 @@ class NodeInitializer { virtual ~NodeInitializer() {} }; +/** + * Dummy do-nothing initializer. Mostly for testing. + */ +Ptr dummy(); + /** * Use a lambda function of form [](Tensor t) { do something with t } to initialize tensor. * @param func functor @@ -263,13 +268,6 @@ Ptr fromWord2vec(const std::string& file, */ Ptr sinusoidalPositionEmbeddings(int start); -/** - * Computes a random rotation matrix for LSH hashing. - * This is part of a hash function. The values are orthonormal and computed via - * QR decomposition. Same seed results in same random rotation. - */ -Ptr randomRotation(size_t seed = Config::seed); - /** * Computes the equivalent of Python's range(). * Computes a range from begin to end-1, like Python's range(). 
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 169b1420b..a180bb5c8 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -21,20 +21,26 @@ class LambdaNodeOp : public NaryNodeOp { std::unique_ptr forward_; std::unique_ptr backward_; + size_t externalHash_; + public: LambdaNodeOp(Inputs inputs, Shape shape, Type type, - LambdaNodeFunctor forward) + LambdaNodeFunctor forward, + size_t externalHash = 0) : NaryNodeOp(inputs, shape, type), - forward_(new LambdaNodeFunctor(forward)) { + forward_(new LambdaNodeFunctor(forward)), + externalHash_(externalHash) { Node::trainable_ = !!backward_; } LambdaNodeOp(Inputs inputs, Shape shape, Type type, LambdaNodeFunctor forward, - LambdaNodeFunctor backward) + LambdaNodeFunctor backward, + size_t externalHash = 0) : NaryNodeOp(inputs, shape, type), forward_(new LambdaNodeFunctor(forward)), - backward_(new LambdaNodeFunctor(backward)) { + backward_(new LambdaNodeFunctor(backward)), + externalHash_(externalHash) { } void forward() override { @@ -50,8 +56,12 @@ class LambdaNodeOp : public NaryNodeOp { virtual size_t hash() override { size_t seed = NaryNodeOp::hash(); - util::hash_combine(seed, forward_.get()); - util::hash_combine(seed, backward_.get()); + if(externalHash_ != 0) { + util::hash_combine(seed, externalHash_); + } else { + util::hash_combine(seed, forward_.get()); + util::hash_combine(seed, backward_.get()); + } return seed; } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 82b02a65f..448b4c4a4 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -795,7 +795,7 @@ struct TransposeNodeOp : public UnaryNodeOp { }; class ReshapeNodeOp : public UnaryNodeOp { -private: +protected: friend class SerializationHelpers; Expr reshapee_; @@ -858,6 +858,45 @@ class ReshapeNodeOp : public UnaryNodeOp { } }; +// @TODO: add version with access to backward step +// This allows to attach a lambda function to any node during the execution. It is a non-operation otherwise +// i.e. doesn't consume any memory or take any time to execute (it's a reshape onto itself) other than the +// compute in the lambda function. It gets called after the forward step of the argument node. +class CallbackNodeOp : public ReshapeNodeOp { +private: + typedef std::function LambdaNodeCallback; + std::unique_ptr callback_; + +public: + CallbackNodeOp(Expr node, LambdaNodeCallback callback) + : ReshapeNodeOp(node, node->shape()), + callback_(new LambdaNodeCallback(callback)) { + } + + void forward() override { + (*callback_)(ReshapeNodeOp::reshapee_); + } + + const std::string type() override { return "callback"; } + + virtual size_t hash() override { + size_t seed = ReshapeNodeOp::hash(); + util::hash_combine(seed, callback_.get()); + return seed; + } + + virtual bool equal(Expr node) override { + if(!ReshapeNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(callback_ != cnode->callback_) // pointer compare on purpose + return false; + return true; + } +}; + // @TODO: review if still required as this is an ugly hack anyway. // Memory less operator that clips gradients during backward step // Executes this as an additional operation on the gradient. 
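A minimal sketch of how the new callback() operator backed by CallbackNodeOp above is meant to be used — hypothetical variable names, assuming an existing uint32 expression in the graph — might look like:

    std::vector<uint32_t> hostIndices;                 // host-side buffer to fill as a side effect
    Expr observed = callback(someUint32Node,           // hypothetical uint32 expression in the graph
        [&hostIndices](Expr node) {
          node->val()->get(hostIndices);               // copy the freshly computed values to the host
        });
    // 'observed' has the same shape and value as 'someUint32Node'; LSHShortlist::filter() in this
    // patch series uses exactly this pattern to pull the LSH search indices out of the graph.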
diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp new file mode 100644 index 000000000..89b482f4c --- /dev/null +++ b/src/layers/lsh.cpp @@ -0,0 +1,233 @@ +#include "layers/lsh.h" +#include "tensors/tensor_operators.h" +#include "common/utils.h" + +#include "3rd_party/faiss/utils/hamming.h" +#include "3rd_party/faiss/Index.h" + +#if BLAS_FOUND +#include "3rd_party/faiss/VectorTransform.h" +#endif + + +namespace marian { +namespace lsh { + +int bytesPerVector(int nBits) { + return (nBits + 7) / 8; +} + +void fillRandomRotationMatrix(Tensor output, Ptr allocator) { +#if BLAS_FOUND + int nRows = output->shape()[-2]; + int nBits = output->shape()[-1]; + + // @TODO re-implement using Marian code so it uses the correct random generator etc. + faiss::RandomRotationMatrix rrot(nRows, nBits); + // Then we do not need to use this seed at all + rrot.init(5); // currently set to 5 following the default from FAISS, this could be any number really. + + // The faiss random rotation matrix is column major, hence we create a temporary tensor, + // copy the rotation matrix into it and transpose to output. + Shape tempShape = {nBits, nRows}; + auto memory = allocator->alloc(requiredBytes(tempShape, output->type())); + auto temp = TensorBase::New(memory, + tempShape, + output->type(), + output->getBackend()); + temp->set(rrot.A); + TransposeND(output, temp, {0, 1, 3, 2}); + allocator->free(memory); +#else + output; allocator; + ABORT("LSH with rotation matrix requires Marian to be compiled with a BLAS library"); +#endif +} + +void encode(Tensor output, Tensor input) { + int nBits = input->shape()[-1]; // number of bits is equal last dimension of float matrix + int nRows = input->shape().elements() / nBits; + faiss::fvecs2bitvecs(input->data(), output->data(), (size_t)nBits, (size_t)nRows); +} + +void encodeWithRotation(Tensor output, Tensor input, Tensor rotation, Ptr allocator) { + int nBits = input->shape()[-1]; // number of bits is equal last dimension of float matrix unless we rotate + int nRows = input->shape().elements() / nBits; + + Tensor tempInput = input; + MemoryPiece::PtrType memory; + if(rotation) { + int nBitsRot = rotation->shape()[-1]; + Shape tempShape = {nRows, nBitsRot}; + memory = allocator->alloc(requiredBytes(tempShape, rotation->type())); + tempInput = TensorBase::New(memory, tempShape, rotation->type(), rotation->getBackend()); + Prod(tempInput, input, rotation, false, false, 0.f, 1.f); + } + encode(output, tempInput); + + if(memory) + allocator->free(memory); +}; + +Expr encode(Expr input, Expr rotation) { + auto encodeFwd = [](Expr out, const std::vector& inputs) { + if(inputs.size() == 1) { + encode(out->val(), inputs[0]->val()); + } else if(inputs.size() == 2) { + encodeWithRotation(out->val(), inputs[0]->val(), inputs[1]->val(), out->graph()->allocator()); + } else { + ABORT("Too many inputs to encode??"); + } + }; + + // Use the address of the first lambda function as an immutable hash. Making it static and const makes sure + // that this hash value will not change. Next pass the hash into the lambda functor were it will be used + // to identify this unique operation. Marian's ExpressionGraph can automatically memoize and identify nodes + // that operate only on immutable nodes (parameters) and have the same hash. This way we make sure that the + // codes node won't actually get recomputed throughout ExpressionGraph lifetime. `codes` will be reused + // and the body of the lambda will not be called again. This does however build one index per graph. 
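  // Illustrative sketch of the effect (hypothetical calls, assuming an embedding parameter Wemb
  // and a rotation expression rot already exist in the graph):
  //   auto codesA = lsh::encode(Wemb, rot);   // builds the node; codes get computed once
  //   auto codesB = lsh::encode(Wemb, rot);   // same immutable inputs + same hash -> node reused
  // Both expressions resolve to the same memoized node, so the byte codes are not recomputed.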
+ static const size_t encodeHash = (size_t)&encodeFwd; + + Shape encodedShape = input->shape(); + + int nBits = rotation ? rotation->shape()[-1] : input->shape()[-1]; + encodedShape.set(-1, bytesPerVector(nBits)); + std::vector inputs = {input}; + if(rotation) + inputs.push_back(rotation); + return lambda(inputs, encodedShape, Type::uint8, encodeFwd, encodeHash); +} + +Expr rotator(Expr weights, int nBits) { + auto rotator = [](Expr out, const std::vector& inputs) { + inputs; + fillRandomRotationMatrix(out->val(), out->graph()->allocator()); + }; + + static const size_t rotatorHash = (size_t)&rotator; + int dim = weights->shape()[-1]; + return lambda({weights}, {dim, nBits}, Type::float32, rotator, rotatorHash); +} + +Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows) { + ABORT_IF(encodedQuery->shape()[-1] != encodedWeights->shape()[-1], + "Query and index bit vectors need to be of same size ({} != {})", encodedQuery->shape()[-1], encodedWeights->shape()[-1]); + + int currBeamSize = encodedQuery->shape()[0]; + int batchSize = encodedQuery->shape()[2]; + int numHypos = currBeamSize * batchSize; + + auto search = [=](Expr out, const std::vector& inputs) { + Expr encodedQuery = inputs[0]; + Expr encodedWeights = inputs[1]; + + int bytesPerVector = encodedWeights->shape()[-1]; + int wRows = encodedWeights->shape().elements() / bytesPerVector; + + // we use this with Factored Segmenter to skip the factor embeddings at the end + if(firstNRows != 0) + wRows = firstNRows; + + int qRows = encodedQuery->shape().elements() / bytesPerVector; + + uint8_t* qCodes = encodedQuery->val()->data(); + uint8_t* wCodes = encodedWeights->val()->data(); + + // use actual faiss code for performing the hamming search. + std::vector distances(qRows * k); + std::vector ids(qRows * k); + faiss::int_maxheap_array_t res = {(size_t)qRows, (size_t)k, ids.data(), distances.data()}; + faiss::hammings_knn_hc(&res, qCodes, wCodes, (size_t)wRows, (size_t)bytesPerVector, 0); + + // Copy int64_t indices to Marian index type and sort by increasing index value per hypothesis. + // The sorting is required as we later do a binary search on those values for reverse look-up. 
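    // Illustrative sketch (assumed, not spelled out in this function) of the reverse look-up that
    // the per-hypothesis sort enables: given the sorted slice [first, first + k) for one hypothesis
    // and a vocabulary id v,
    //   auto pos = std::lower_bound(first, first + k, v);   // O(log k) instead of a linear scan
    // locates v's packed position in the short list, which is what the shortlist's reverseMap needs.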
+ uint32_t* outData = out->val()->data(); + for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { + size_t startIdx = k * hypoIdx; + size_t endIdx = startIdx + k; + for(size_t i = startIdx; i < endIdx; ++i) + outData[i] = (uint32_t)ids[i]; + std::sort(outData + startIdx, outData + endIdx); + } + }; + + Shape kShape({currBeamSize, batchSize, k}); + return lambda({encodedQuery, encodedWeights}, kShape, Type::uint32, search); +} + +Expr search(Expr query, Expr weights, int k, int nBits, int firstNRows) { + int dim = weights->shape()[-1]; + + Expr rotMat = nullptr; + if(dim != nBits) { + rotMat = weights->graph()->get("lsh_output_rotation"); + if(rotMat) { + LOG_ONCE(info, "Reusing parameter LSH rotation matrix {} with shape {}", rotMat->name(), rotMat->shape()); + } else { + LOG_ONCE(info, "Creating ad-hoc rotation matrix with shape {}", Shape({dim, nBits})); + rotMat = rotator(weights, nBits); + } + } + + Expr encodedWeights = weights->graph()->get("lsh_output_codes"); + if(encodedWeights) { + LOG_ONCE(info, "Reusing parameter LSH code matrix {} with shape {}", encodedWeights->name(), encodedWeights->shape()); + } else { + LOG_ONCE(info, "Creating ad-hoc code matrix with shape {}", Shape({weights->shape()[-2], lsh::bytesPerVector(nBits)})); + encodedWeights = encode(weights, rotMat); + } + + return searchEncoded(encode(query, rotMat), encodedWeights, k, firstNRows); +} + +class RandomRotation : public inits::NodeInitializer { +public: + void apply(Tensor tensor) override { + auto sharedAllocator = allocator_.lock(); + ABORT_IF(!sharedAllocator, "Allocator in RandomRotation has not been set or expired"); + fillRandomRotationMatrix(tensor, sharedAllocator); + } +}; + +Ptr randomRotation() { + return New(); +} + +void addDummyParameters(Ptr graph, std::string weightsName, int nBitsRot) { + auto weights = graph->get(weightsName); + + ABORT_IF(!weights, "Trying to encode non-existing weights matrix {}??", weightsName); + + int nBits = weights->shape()[-1]; + int nRows = weights->shape().elements() / nBits; + + Expr rotation; + if(nBits != nBitsRot) { + LOG(info, "Adding LSH rotation parameter lsh_output_rotation with shape {}", Shape({nBits, nBitsRot})); + rotation = graph->param("lsh_output_rotation", {nBits, nBitsRot}, inits::dummy(), Type::float32); + nBits = nBitsRot; + } + + int bytesPerVector = lsh::bytesPerVector(nBits); + LOG(info, "Adding LSH encoded weights lsh_output_codes with shape {}", Shape({nRows, bytesPerVector})); + auto codes = graph->param("lsh_output_codes", {nRows, bytesPerVector}, inits::dummy(), Type::uint8); +} + +void overwriteDummyParameters(Ptr graph, std::string weightsName) { + Expr weights = graph->get(weightsName); + Expr codes = graph->get("lsh_output_codes"); + Expr rotation = graph->get("lsh_output_rotation"); + + ABORT_IF(!weights, "Trying to encode non-existing weights matrix {}??", weightsName); + ABORT_IF(!codes, "Trying to overwrite non-existing LSH parameters lsh_output_codes??"); + + if(rotation) { + fillRandomRotationMatrix(rotation->val(), weights->graph()->allocator()); + encodeWithRotation(codes->val(), weights->val(), rotation->val(), weights->graph()->allocator()); + } else { + encode(codes->val(), weights->val()); + } +} + +} +} \ No newline at end of file diff --git a/src/layers/lsh.h b/src/layers/lsh.h new file mode 100644 index 000000000..60908238f --- /dev/null +++ b/src/layers/lsh.h @@ -0,0 +1,49 @@ +#pragma once + +#include "graph/expression_operators.h" +#include "graph/node_initializers.h" + +#include + +/** + * In this file we 
basically take the faiss::IndexLSH and pick it apart so that the individual steps
+ * can be implemented as Marian inference operators. We can encode the inputs and weights into their
+ * bitwise equivalents, apply the hashing rotation (if required), and perform the actual search.
+ *
+ * This also allows creating parameters that get dumped into the model weight file. This is currently
+ * a bit hacky (see marian-conv), but once this is done the model can memory-map the LSH with existing
+ * mechanisms and no additional memory is consumed to build the index or rotation matrix.
+ */
+
+namespace marian {
+namespace lsh {
+
+  // return the number of full bytes required to encode that many bits
+  int bytesPerVector(int nBits);
+
+  // encodes an input as a bit vector, with optional rotation
+  Expr encode(Expr input, Expr rotator = nullptr);
+
+  // compute the rotation matrix (maps weights->shape()[-1] to nbits floats)
+  Expr rotator(Expr weights, int nbits);
+
+  // perform the LSH search on fully encoded input and weights, return k results (indices) per input row
+  // @TODO: add a top-k like operator that also returns the bitwise computed distances
+  Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows = 0);
+
+  // same as above, but performs encoding on the fly
+  Expr search(Expr query, Expr weights, int k, int nbits, int firstNRows = 0);
+
+  // These are helper functions for encoding the LSH into the binary Marian model, used by marian-conv
+  void addDummyParameters(Ptr graph, std::string weightsName, int nBits);
+  void overwriteDummyParameters(Ptr graph, std::string weightsName);
+
+  /**
+   * Computes a random rotation matrix for LSH hashing.
+   * This is part of a hash function. The values are orthonormal and computed via
+   * QR decomposition.
+   */
+  Ptr randomRotation();
+}
+
+}
\ No newline at end of file
diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp
index 6476df8fe..70e657a93 100644
--- a/src/microsoft/quicksand.cpp
+++ b/src/microsoft/quicksand.cpp
@@ -11,6 +11,7 @@
 #include "data/alignment.h"
 #include "data/vocab_base.h"
 #include "tensors/cpu/expression_graph_packable.h"
+#include "layers/lsh.h"
 #if USE_FBGEMM
 #include "fbgemm/Utils.h"
@@ -248,7 +249,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) {
 // This function converts an fp32 model into an FBGEMM based packed model.
 // marian defined types are used for external project as well.
 // The targetPrec is passed as int32_t for the exported function definition.
-bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec) {
+bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, bool addLsh) {
   std::cerr << "Converting from: " << inputFile << ", to: " << outputFile << ", precision: " << targetPrec << std::endl;
   YAML::Node config;
@@ -260,7 +261,26 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP
   graph->setDevice(CPU0);
   graph->load(inputFile);
-  graph->forward();
+
+  // MJD: Note, this is a default setting which we might want to change or expose. Use this only with Polonium students.
+  // The LSH will not be used by default even if it exists in the model. That has to be enabled in the decoder config.
+  int lshNBits = 1024;
+  std::string lshOutputWeights = "Wemb";
+  if(addLsh) {
+    // Add dummy parameters for the LSH before the model gets actually initialized.
+    // This creates the parameters with useless values in the tensors, but it gives us the memory we need.
+    graph->setReloaded(false);
+    lsh::addDummyParameters(graph, /*weights=*/lshOutputWeights, /*nBits=*/lshNBits);
+    graph->setReloaded(true);
+  }
+
+  graph->forward(); // run the initializers
+
+  if(addLsh) {
+    // After initialization, hijack the parameters for the LSH and force-overwrite with correct values.
+    // Once this is done we can just pack and save as normal.
+    lsh::overwriteDummyParameters(graph, /*weights=*/lshOutputWeights);
+  }
   Type targetPrecType = (Type) targetPrec;
   if (targetPrecType == Type::packed16
diff --git a/src/microsoft/quicksand.h b/src/microsoft/quicksand.h
index 87de19482..b710e1352 100644
--- a/src/microsoft/quicksand.h
+++ b/src/microsoft/quicksand.h
@@ -76,7 +76,10 @@ std::vector> loadVocabs(const std::vector& vocab
 DecoderCpuAvxVersion getCpuAvxVersion();
 DecoderCpuAvxVersion parseCpuAvxVersion(std::string name);
-bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec);
+// MJD: added "addLsh" which will now break whatever compilation after update. That's on purpose.
+// The calling code should be adapted, not this interface. If you need to fix things in QS because of this
+// talk to me first!
+bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, bool addLsh);
 } // namespace quicksand
 } // namespace marian
diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h
index 689aa3b18..f5a9cad9c 100644
--- a/src/tensors/cpu/expression_graph_packable.h
+++ b/src/tensors/cpu/expression_graph_packable.h
@@ -27,14 +27,17 @@ class ExpressionGraphPackable : public ExpressionGraph {
   virtual ~ExpressionGraphPackable() {}
   // Convert model weights into packed format and save to IO items.
-  // @TODO: review this
-  void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
+  std::vector pack(Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
     std::vector ioItems;
+    // handle packable parameters first (a float32 parameter is packable)
+    auto packableParameters = paramsByElementType_[Type::float32];
     // sorted by name in std::map
-    for (auto p : params()->getMap()) {
+    for (auto p : packableParameters->getMap()) {
       std::string pName = p.first;
+      LOG(info, "Processing parameter {} with shape {} and type {}", pName, p.second->shape(), p.second->value_type());
+
       if (!namespace_.empty()) {
         if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
           pName = pName.substr(namespace_.size() + 2);
@@ -257,6 +260,33 @@ class ExpressionGraphPackable : public ExpressionGraph {
     }
   }
+    // Now handle all non-float32 parameters
+    for(auto& iter : paramsByElementType_) {
+      auto type = iter.first;
+      if(type == Type::float32)
+        continue;
+
+      for (auto p : iter.second->getMap()) {
+        std::string pName = p.first;
+        LOG(info, "Processing parameter {} with shape {} and type {}", pName, p.second->shape(), p.second->value_type());
+
+        if (!namespace_.empty()) {
+          if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
+            pName = pName.substr(namespace_.size() + 2);
+        }
+
+        Tensor val = p.second->val();
+        io::Item item;
+        val->get(item, pName);
+        ioItems.emplace_back(std::move(item));
+      }
+    }
+
+    return ioItems;
+  }
+
+  void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
+    auto ioItems = pack(gemmElementType, saveElementType);
     if (!meta.empty())
       io::addMetaToItems(meta, "special:model.yml",
ioItems); io::saveItems(name, ioItems); diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h index 10c3e7f19..a70714043 100644 --- a/src/tensors/tensor.h +++ b/src/tensors/tensor.h @@ -35,7 +35,8 @@ class TensorBase { ENABLE_INTRUSIVE_PTR(TensorBase) - // Constructors are private, use TensorBase::New(...) +protected: + // Constructors are protected, use TensorBase::New(...) TensorBase(MemoryPiece::PtrType memory, Shape shape, Type type, @@ -61,10 +62,10 @@ class TensorBase { shape_(shape), type_(type), backend_(backend) {} public: - // Use this whenever pointing to MemoryPiece + // Use this whenever pointing to TensorBase typedef IPtr PtrType; - // Use this whenever creating a pointer to MemoryPiece + // Use this whenever creating a pointer to TensorBase template static PtrType New(Args&& ...args) { return PtrType(new TensorBase(std::forward(args)...)); diff --git a/src/training/training_state.h b/src/training/training_state.h index e0c1ba5df..ce0895a24 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -142,8 +142,9 @@ class TrainingState { // for periods. bool enteredNewPeriodOf(std::string schedulingParam) const { auto period = SchedulingParameter::parse(schedulingParam); + // @TODO: adapt to logical epochs ABORT_IF(period.unit == SchedulingUnit::epochs, - "Unit {} is not supported for frequency parameters (the one(s) with value {})", + "Unit {} is not supported for frequency parameters", schedulingParam); auto previousProgress = getPreviousProgressIn(period.unit); auto progress = getProgressIn(period.unit); From 3a478fc47d387ab339f4fe05abd9c0741c026cce Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 9 Jul 2021 13:46:18 -0700 Subject: [PATCH 097/254] update version and changelog --- CHANGELOG.md | 2 ++ VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cb7c305b..b95ffb5ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Adds option --add-lsh to marian-conv which allows the LSH to be memory-mapped. - Early stopping based on first, all, or any validation metrics via `--early-stopping-on` - Compute 8.6 support if using CUDA>=11.1 - Support for RMSNorm as drop-in replace for LayerNorm from `Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization`. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. @@ -45,6 +46,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Broken links to MNIST data sets ### Changed +- Optimize LSH for speed by treating is as a shortlist generator. No option changes in decoder - Set REQUIRED_BIAS_ALIGNMENT = 16 in tensors/gpu/prod.cpp to avoid memory-misalignment on certain Ampere GPUs. 
- For BUILD_ARCH != native enable all intrinsics types by default, can be disabled like this: -DCOMPILE_AVX512=off - Moved FBGEMM pointer to commit c258054 for gcc 9.3+ fix diff --git a/VERSION b/VERSION index e7f4fc036..90c7aba70 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.21 +v1.10.23 From 7e6ea51841025d5abdb6fdb1fc33dc4907355dc9 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 10 Jul 2021 08:36:18 -0700 Subject: [PATCH 098/254] silence unreferenced formal parameter warning on windows --- src/3rd_party/faiss/utils/hamming-inl.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/3rd_party/faiss/utils/hamming-inl.h b/src/3rd_party/faiss/utils/hamming-inl.h index b164dc88e..b6aaa3385 100644 --- a/src/3rd_party/faiss/utils/hamming-inl.h +++ b/src/3rd_party/faiss/utils/hamming-inl.h @@ -92,6 +92,7 @@ struct HammingComputer4 { } void set (const uint8_t *a, int code_size) { + code_size; assert (code_size == 4); a0 = *(uint32_t *)a; } @@ -112,6 +113,7 @@ struct HammingComputer8 { } void set (const uint8_t *a, int code_size) { + code_size; assert (code_size == 8); a0 = *(uint64_t *)a; } @@ -133,6 +135,7 @@ struct HammingComputer16 { } void set (const uint8_t *a8, int code_size) { + code_size; assert (code_size == 16); const uint64_t *a = (uint64_t *)a8; a0 = a[0]; a1 = a[1]; @@ -158,6 +161,7 @@ struct HammingComputer20 { } void set (const uint8_t *a8, int code_size) { + code_size; assert (code_size == 20); const uint64_t *a = (uint64_t *)a8; a0 = a[0]; a1 = a[1]; a2 = (uint32_t)a[2]; @@ -180,6 +184,7 @@ struct HammingComputer32 { } void set (const uint8_t *a8, int code_size) { + code_size; assert (code_size == 32); const uint64_t *a = (uint64_t *)a8; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; @@ -203,6 +208,7 @@ struct HammingComputer64 { } void set (const uint8_t *a8, int code_size) { + code_size; assert (code_size == 64); const uint64_t *a = (uint64_t *)a8; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; @@ -255,6 +261,7 @@ struct HammingComputerM8 { } void set (const uint8_t *a8, int code_size) { + code_size; assert (code_size % 8 == 0); a = (uint64_t *)a8; n = code_size / 8; @@ -282,6 +289,7 @@ struct HammingComputerM4 { } void set (const uint8_t *a4, int code_size) { + code_size; assert (code_size % 4 == 0); a = (uint32_t *)a4; n = code_size / 4; @@ -344,6 +352,7 @@ struct GenHammingComputer8 { uint64_t a0; GenHammingComputer8 (const uint8_t *a, int code_size) { + code_size; assert (code_size == 8); a0 = *(uint64_t *)a; } @@ -358,6 +367,7 @@ struct GenHammingComputer8 { struct GenHammingComputer16 { uint64_t a0, a1; GenHammingComputer16 (const uint8_t *a8, int code_size) { + code_size; assert (code_size == 16); const uint64_t *a = (uint64_t *)a8; a0 = a[0]; a1 = a[1]; @@ -375,6 +385,7 @@ struct GenHammingComputer32 { uint64_t a0, a1, a2, a3; GenHammingComputer32 (const uint8_t *a8, int code_size) { + code_size; assert (code_size == 32); const uint64_t *a = (uint64_t *)a8; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; @@ -395,6 +406,7 @@ struct GenHammingComputerM8 { int n; GenHammingComputerM8 (const uint8_t *a8, int code_size) { + code_size; assert (code_size % 8 == 0); a = (uint64_t *)a8; n = code_size / 8; From 42f0b8b74bba16fed646c8af7b2f75e02af7a85c Mon Sep 17 00:00:00 2001 From: Qianqian Zhu Date: Sun, 11 Jul 2021 06:56:58 +0100 Subject: [PATCH 099/254] Binary shortlist (#856) Co-authored-by: Kenneth Heafield --- CHANGELOG.md | 2 + src/command/marian_conv.cpp | 26 +++- src/common/hash.h | 14 +- src/data/shortlist.cpp | 260 
+++++++++++++++++++++++++++++++++++- src/data/shortlist.h | 74 +++++++++- src/translator/translator.h | 10 +- 6 files changed, 372 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b95ffb5ee..a9e24f573 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Add unit tests for binary files. - Fix compilation with OMP - Compute aligned memory sizes using exact sizing +- Support for loading lexical shortlist from a binary blob +- Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option ### Fixed - Added support to MPIWrappest::bcast (and similar) for count of type size_t diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index e0e89d2bc..943f61d48 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -3,7 +3,7 @@ #include "tensors/cpu/expression_graph_packable.h" #include "onnx/expression_graph_onnx_exporter.h" #include "layers/lsh.h" - +#include "data/shortlist.h" #include int main(int argc, char** argv) { @@ -16,7 +16,8 @@ int main(int argc, char** argv) { YAML::Node config; // @TODO: get rid of YAML::Node here entirely to avoid the pattern. Currently not fixing as it requires more changes to the Options object. auto cli = New( config, - "Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout", + "Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout\n" + "or convert a text lexical shortlist to a binary shortlist with {--shortlist,-s} option", "Allowed options", "Examples:\n" " ./marian-conv -f model.npz -t model.bin --gemm-type packed16"); @@ -30,9 +31,30 @@ int main(int argc, char** argv) { "Encode output matrix and optional rotation matrix into model file. " "arg1: number of bits in LSH encoding, arg2: name of output weights matrix")->implicit_val("1024 Wemb"); cli->add>("--vocabs,-V", "Vocabulary file, required for ONNX export"); + cli->add>("--shortlist,-s", "Shortlist conversion: filePath firstNum bestNum threshold"); + cli->add("--dump-shortlist,-d", "Binary shortlist dump path","lex.bin"); cli->parse(argc, argv); options->merge(config); } + + // shortlist conversion: + // ./marian-conv --shortlist lex.esen.s2t 100 100 0 --dump-shortlist lex.esen.bin --vocabs vocab.esen.spm vocab.esen.spm + if(options->hasAndNotEmpty("shortlist")){ + auto vocabPaths = options->get>("vocabs"); + auto dumpPath = options->get("dump-shortlist"); + + Ptr srcVocab = New(options, 0); + srcVocab->load(vocabPaths[0]); + Ptr trgVocab = New(options, 1); + trgVocab->load(vocabPaths[1]); + + Ptr binaryShortlistGenerator + = New(options, srcVocab, trgVocab, 0, 1, vocabPaths[0] == vocabPaths[1]); + binaryShortlistGenerator->dump(dumpPath); + LOG(info, "Dumping of the shortlist is finished"); + return 0; + } + auto modelFrom = options->get("from"); auto modelTo = options->get("to"); diff --git a/src/common/hash.h b/src/common/hash.h index 1b24dbe27..7aca30de2 100644 --- a/src/common/hash.h +++ b/src/common/hash.h @@ -10,20 +10,20 @@ template using hash = std::hash; // This combinator is based on boost::hash_combine, but uses // std::hash as the hash implementation. Used as a drop-in // replacement for boost::hash_combine. 
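// Typical usage, shown here only as an illustration and mirroring how the codebase calls it elsewhere:
//   size_t seed = 0;
//   util::hash_combine(seed, std::string("Wemb"));
//   util::hash_combine(seed, 1024);
//   // 'seed' now depends on both values, chained the same way boost::hash_combine would chain them.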
-template -inline void hash_combine(std::size_t& seed, T const& v) { +template +inline void hash_combine(HashType& seed, T const& v) { hash hasher; - seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2); + seed ^= static_cast(hasher(v)) + 0x9e3779b9 + (seed<<6) + (seed>>2); } // Hash a whole chunk of memory, mostly used for diagnostics -template -inline size_t hashMem(const T* beg, size_t len) { - size_t seed = 0; +template +inline HashType hashMem(const T* beg, size_t len) { + HashType seed = 0; for(auto it = beg; it < beg + len; ++it) hash_combine(seed, *it); return seed; } } -} \ No newline at end of file +} diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 396c6ba49..79d685e0e 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -3,6 +3,8 @@ #include "marian.h" #include "layers/lsh.h" +#include + namespace marian { namespace data { @@ -279,7 +281,9 @@ Ptr createShortlistGenerator(Ptr options, std::vector vals = options->get>("shortlist"); ABORT_IF(vals.empty(), "No path to shortlist given"); std::string fname = vals[0]; - if(filesystem::Path(fname).extension().string() == ".bin") { + if(isBinaryShortlist(fname)){ + return New(options, srcVocab, trgVocab, srcIdx, trgIdx, shared); + } else if(filesystem::Path(fname).extension().string() == ".bin") { return New(options, srcVocab, trgVocab, srcIdx, trgIdx, shared); } else { return New(options, srcVocab, trgVocab, srcIdx, trgIdx, shared); @@ -287,5 +291,259 @@ Ptr createShortlistGenerator(Ptr options, } } +bool isBinaryShortlist(const std::string& fileName){ + uint64_t magic; + io::InputFileStream in(fileName); + in.read((char*)(&magic), sizeof(magic)); + return in && (magic == BINARY_SHORTLIST_MAGIC); +} + +void BinaryShortlistGenerator::contentCheck() { + bool failFlag = 0; + // The offset table has to be within the size of shortlists. + for(int i = 0; i < wordToOffsetSize_-1; i++) + failFlag |= wordToOffset_[i] >= shortListsSize_; + + // The last element of wordToOffset_ must equal shortListsSize_ + failFlag |= wordToOffset_[wordToOffsetSize_-1] != shortListsSize_; + + // The vocabulary indices have to be within the vocabulary size. 
+ size_t vSize = trgVocab_->size(); + for(int j = 0; j < shortListsSize_; j++) + failFlag |= shortLists_[j] >= vSize; + ABORT_IF(failFlag, "Error: shortlist indices are out of bounds"); +} + +// load shortlist from buffer +void BinaryShortlistGenerator::load(const void* ptr_void, size_t blobSize, bool check /*= true*/) { + /* File layout: + * header + * wordToOffset array + * shortLists array + */ + ABORT_IF(blobSize < sizeof(Header), "Shortlist length {} too short to have a header", blobSize); + + const char *ptr = static_cast(ptr_void); + const Header &header = *reinterpret_cast(ptr); + ptr += sizeof(Header); + ABORT_IF(header.magic != BINARY_SHORTLIST_MAGIC, "Incorrect magic in binary shortlist"); + + uint64_t expectedSize = sizeof(Header) + header.wordToOffsetSize * sizeof(uint64_t) + header.shortListsSize * sizeof(WordIndex); + ABORT_IF(expectedSize != blobSize, "Shortlist header claims file size should be {} but file is {}", expectedSize, blobSize); + + if (check) { + uint64_t checksumActual = util::hashMem(&header.firstNum, (blobSize - sizeof(header.magic) - sizeof(header.checksum)) / sizeof(uint64_t)); + ABORT_IF(checksumActual != header.checksum, "checksum check failed: this binary shortlist is corrupted"); + } + + firstNum_ = header.firstNum; + bestNum_ = header.bestNum; + LOG(info, "[data] Lexical short list firstNum {} and bestNum {}", firstNum_, bestNum_); + + wordToOffsetSize_ = header.wordToOffsetSize; + shortListsSize_ = header.shortListsSize; + + // Offsets right after header. + wordToOffset_ = reinterpret_cast(ptr); + ptr += wordToOffsetSize_ * sizeof(uint64_t); + + shortLists_ = reinterpret_cast(ptr); + + // Verify offsets and vocab ids are within bounds if requested by user. + if(check) + contentCheck(); +} + +// load shortlist from file +void BinaryShortlistGenerator::load(const std::string& filename, bool check /*=true*/) { + std::error_code error; + mmapMem_.map(filename, error); + ABORT_IF(error, "Error mapping file: {}", error.message()); + load(mmapMem_.data(), mmapMem_.mapped_length(), check); +} + +BinaryShortlistGenerator::BinaryShortlistGenerator(Ptr options, + Ptr srcVocab, + Ptr trgVocab, + size_t srcIdx /*= 0*/, + size_t /*trgIdx = 1*/, + bool shared /*= false*/) + : options_(options), + srcVocab_(srcVocab), + trgVocab_(trgVocab), + srcIdx_(srcIdx), + shared_(shared) { + + std::vector vals = options_->get>("shortlist"); + ABORT_IF(vals.empty(), "No path to shortlist file given"); + std::string fname = vals[0]; + + if(isBinaryShortlist(fname)){ + bool check = vals.size() > 1 ? std::stoi(vals[1]) : 1; + LOG(info, "[data] Loading binary shortlist as {} {}", fname, check); + load(fname, check); + } + else{ + firstNum_ = vals.size() > 1 ? std::stoi(vals[1]) : 100; + bestNum_ = vals.size() > 2 ? std::stoi(vals[2]) : 100; + float threshold = vals.size() > 3 ? 
std::stof(vals[3]) : 0; + LOG(info, "[data] Importing text lexical shortlist as {} {} {} {}", + fname, firstNum_, bestNum_, threshold); + import(fname, threshold); + } +} + +BinaryShortlistGenerator::BinaryShortlistGenerator(const void *ptr_void, + const size_t blobSize, + Ptr srcVocab, + Ptr trgVocab, + size_t srcIdx /*= 0*/, + size_t /*trgIdx = 1*/, + bool shared /*= false*/, + bool check /*= true*/) + : srcVocab_(srcVocab), + trgVocab_(trgVocab), + srcIdx_(srcIdx), + shared_(shared) { + load(ptr_void, blobSize, check); +} + +Ptr BinaryShortlistGenerator::generate(Ptr batch) const { + auto srcBatch = (*batch)[srcIdx_]; + size_t srcVocabSize = srcVocab_->size(); + size_t trgVocabSize = trgVocab_->size(); + + // Since V=trgVocab_->size() is not large, anchor the time and space complexity to O(V). + // Attempt to squeeze the truth tables into CPU cache + std::vector srcTruthTable(srcVocabSize, 0); // holds selected source words + std::vector trgTruthTable(trgVocabSize, 0); // holds selected target words + + // add firstNum most frequent words + for(WordIndex i = 0; i < firstNum_ && i < trgVocabSize; ++i) + trgTruthTable[i] = 1; + + // collect unique words from source + // add aligned target words: mark trgTruthTable[word] to 1 + for(auto word : srcBatch->data()) { + WordIndex srcIndex = word.toWordIndex(); + if(shared_) + trgTruthTable[srcIndex] = 1; + // If srcIndex has not been encountered, add the corresponding target words + if (!srcTruthTable[srcIndex]) { + for (uint64_t j = wordToOffset_[srcIndex]; j < wordToOffset_[srcIndex+1]; j++) + trgTruthTable[shortLists_[j]] = 1; + srcTruthTable[srcIndex] = 1; + } + } + + // Due to the 'multiple-of-eight' issue, the following O(N) patch is inserted + size_t trgTruthTableOnes = 0; // counter for no. of selected target words + for (size_t i = 0; i < trgVocabSize; i++) { + if(trgTruthTable[i]) + trgTruthTableOnes++; + } + + // Ensure that the generated vocabulary items from a shortlist are a multiple-of-eight + // This is necessary until intgemm supports non-multiple-of-eight matrices. 
+ for (size_t i = firstNum_; i < trgVocabSize && trgTruthTableOnes%8!=0; i++){ + if (!trgTruthTable[i]){ + trgTruthTable[i] = 1; + trgTruthTableOnes++; + } + } + + // turn selected indices into vector and sort (Bucket sort: O(V)) + std::vector indices; + for (WordIndex i = 0; i < trgVocabSize; i++) { + if(trgTruthTable[i]) + indices.push_back(i); + } + + return New(indices); +} + +void BinaryShortlistGenerator::dump(const std::string& fileName) const { + ABORT_IF(mmapMem_.is_open(),"No need to dump again"); + LOG(info, "[data] Saving binary shortlist dump to {}", fileName); + saveBlobToFile(fileName); +} + +void BinaryShortlistGenerator::import(const std::string& filename, double threshold) { + io::InputFileStream in(filename); + std::string src, trg; + + // Read text file + std::vector> srcTgtProbTable(srcVocab_->size()); + float prob; + + while(in >> trg >> src >> prob) { + if(src == "NULL" || trg == "NULL") + continue; + + auto sId = (*srcVocab_)[src].toWordIndex(); + auto tId = (*trgVocab_)[trg].toWordIndex(); + + if(srcTgtProbTable[sId][tId] < prob) + srcTgtProbTable[sId][tId] = prob; + } + + // Create priority queue and count + std::vector>> vpq; + uint64_t shortListsSize = 0; + + vpq.resize(srcTgtProbTable.size()); + for(WordIndex sId = 0; sId < srcTgtProbTable.size(); sId++) { + uint64_t shortListsSizeCurrent = 0; + for(auto entry : srcTgtProbTable[sId]) { + if (entry.first>=threshold) { + vpq[sId].push(std::make_pair(entry.second, entry.first)); + if(shortListsSizeCurrent < bestNum_) + shortListsSizeCurrent++; + } + } + shortListsSize += shortListsSizeCurrent; + } + + wordToOffsetSize_ = vpq.size() + 1; + shortListsSize_ = shortListsSize; + + // Generate a binary blob + blob_.resize(sizeof(Header) + wordToOffsetSize_ * sizeof(uint64_t) + shortListsSize_ * sizeof(WordIndex)); + struct Header* pHeader = (struct Header *)blob_.data(); + pHeader->magic = BINARY_SHORTLIST_MAGIC; + pHeader->firstNum = firstNum_; + pHeader->bestNum = bestNum_; + pHeader->wordToOffsetSize = wordToOffsetSize_; + pHeader->shortListsSize = shortListsSize_; + uint64_t* wordToOffset = (uint64_t*)((char *)pHeader + sizeof(Header)); + WordIndex* shortLists = (WordIndex*)((char*)wordToOffset + wordToOffsetSize_*sizeof(uint64_t)); + + uint64_t shortlistIdx = 0; + for (size_t i = 0; i < wordToOffsetSize_ - 1; i++) { + wordToOffset[i] = shortlistIdx; + for(int popcnt = 0; popcnt < bestNum_ && !vpq[i].empty(); popcnt++) { + shortLists[shortlistIdx] = vpq[i].top().second; + shortlistIdx++; + vpq[i].pop(); + } + } + wordToOffset[wordToOffsetSize_-1] = shortlistIdx; + + // Sort word indices for each shortlist + for(int i = 1; i < wordToOffsetSize_; i++) { + std::sort(&shortLists[wordToOffset[i-1]], &shortLists[wordToOffset[i]]); + } + pHeader->checksum = (uint64_t)util::hashMem((uint64_t *)blob_.data()+2, + blob_.size()/sizeof(uint64_t)-2); + + wordToOffset_ = wordToOffset; + shortLists_ = shortLists; +} + +void BinaryShortlistGenerator::saveBlobToFile(const std::string& fileName) const { + io::OutputFileStream outTop(fileName); + outTop.write(blob_.data(), blob_.size()); +} + } // namespace data } // namespace marian diff --git a/src/data/shortlist.h b/src/data/shortlist.h index d3841b21a..f15e54555 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -256,7 +256,6 @@ class LexicalShortlistGenerator : public ShortlistGenerator { bestNum_ = vals.size() > 2 ? std::stoi(vals[2]) : 100; float threshold = vals.size() > 3 ? std::stof(vals[3]) : 0; std::string dumpPath = vals.size() > 4 ? 
vals[4] : ""; - LOG(info, "[data] Loading lexical shortlist as {} {} {} {}", fname, @@ -392,5 +391,78 @@ Ptr createShortlistGenerator(Ptr options, size_t trgIdx = 1, bool shared = false); +// Magic signature for binary shortlist: +// ASCII and Unicode text files never start with the following 64 bits +const uint64_t BINARY_SHORTLIST_MAGIC = 0xF11A48D5013417F5; + +bool isBinaryShortlist(const std::string& fileName); + +class BinaryShortlistGenerator : public ShortlistGenerator { +private: + Ptr options_; + Ptr srcVocab_; + Ptr trgVocab_; + + size_t srcIdx_; + bool shared_{false}; + + uint64_t firstNum_{100}; // baked into binary header + uint64_t bestNum_{100}; // baked into binary header + + // shortlist is stored in a skip list + // [&shortLists_[wordToOffset_[word]], &shortLists_[wordToOffset_[word+1]]) + // is a sorted array of word indices in the shortlist for word + mio::mmap_source mmapMem_; + uint64_t wordToOffsetSize_; + uint64_t shortListsSize_; + const uint64_t *wordToOffset_; + const WordIndex *shortLists_; + std::vector blob_; // binary blob + + struct Header { + uint64_t magic; // BINARY_SHORTLIST_MAGIC + uint64_t checksum; // util::hashMem from &firstNum to end of file. + uint64_t firstNum; // Limits used to create the shortlist. + uint64_t bestNum; + uint64_t wordToOffsetSize; // Length of wordToOffset_ array. + uint64_t shortListsSize; // Length of shortLists_ array. + }; + + void contentCheck(); + // load shortlist from buffer + void load(const void* ptr_void, size_t blobSize, bool check = true); + // load shortlist from file + void load(const std::string& filename, bool check=true); + // import text shortlist from file + void import(const std::string& filename, double threshold); + // save blob to file (called by dump) + void saveBlobToFile(const std::string& filename) const; + +public: + BinaryShortlistGenerator(Ptr options, + Ptr srcVocab, + Ptr trgVocab, + size_t srcIdx = 0, + size_t /*trgIdx*/ = 1, + bool shared = false); + + // construct directly from buffer + BinaryShortlistGenerator(const void* ptr_void, + const size_t blobSize, + Ptr srcVocab, + Ptr trgVocab, + size_t srcIdx = 0, + size_t /*trgIdx*/ = 1, + bool shared = false, + bool check = true); + + ~BinaryShortlistGenerator(){ + mmapMem_.unmap(); + } + + virtual Ptr generate(Ptr batch) const override; + virtual void dump(const std::string& fileName) const override; +}; + } // namespace data } // namespace marian diff --git a/src/translator/translator.h b/src/translator/translator.h index 8cc301b45..0829f98e6 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -274,11 +274,15 @@ class TranslateService : public ModelServiceTask { trgVocab_ = New(options_, vocabPaths.size() - 1); trgVocab_->load(vocabPaths.back()); + auto srcVocab = srcVocabs_.front(); + + std::vector lshOpts = options_->get>("output-approx-knn"); + ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); // load lexical shortlist - if(options_->hasAndNotEmpty("shortlist")) - shortlistGenerator_ = New( - options_, srcVocabs_.front(), trgVocab_, 0, 1, vocabPaths.front() == vocabPaths.back()); + if (lshOpts.size() == 2 || options_->hasAndNotEmpty("shortlist")) { + shortlistGenerator_ = data::createShortlistGenerator(options_, srcVocab, trgVocab_, lshOpts, 0, 1, vocabPaths.front() == vocabPaths.back()); + } // get device IDs auto devices = Config::getDevices(options_); From 8e88071ae8caa89e3926c0d7281d8a59897e222c Mon Sep 17 00:00:00 2001 From: Martin Junczys-Dowmunt Date: Fri, 16 Jul 
2021 20:04:16 +0000 Subject: [PATCH 100/254] Merged PR 19842: Adapt LSH to work with Leaf Small changes to make the LSH work with Leaf server and QuickSand. --- CHANGELOG.md | 1 + VERSION | 2 +- src/3rd_party/sentencepiece | 2 +- src/common/logging.h | 6 +++++ src/data/shortlist.cpp | 16 +++++------ src/data/shortlist.h | 8 ++++-- src/layers/lsh.cpp | 4 ++- src/layers/lsh.h | 2 +- src/microsoft/quicksand.cpp | 27 +++++++++++++++---- src/microsoft/quicksand.h | 2 +- src/microsoft/shortlist/utils/Converter.cpp | 2 ++ src/microsoft/shortlist/utils/Converter.h | 2 ++ .../shortlist/utils/ParameterTree.cpp | 3 ++- src/microsoft/shortlist/utils/ParameterTree.h | 2 ++ src/microsoft/shortlist/utils/StringUtils.cpp | 2 ++ src/microsoft/shortlist/utils/StringUtils.h | 2 ++ src/translator/translator.h | 2 +- 17 files changed, 63 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9e24f573..1a0b99272 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option ### Fixed +- Various fixes to enable LSH in Quicksand - Added support to MPIWrappest::bcast (and similar) for count of type size_t - Adding new validation metrics when training is restarted and --reset-valid-stalled is used - Missing depth-scaling in transformer FFN diff --git a/VERSION b/VERSION index 90c7aba70..3c40cf565 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.23 +v1.10.24 diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 5bafa8e8c..28f9eb890 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 5bafa8e8c3391bbe9721a16e986408341f95774c +Subproject commit 28f9eb890f62907406c629acd2f04ca9b71442c9 diff --git a/src/common/logging.h b/src/common/logging.h index 776c45ebe..855bda90d 100644 --- a/src/common/logging.h +++ b/src/common/logging.h @@ -4,6 +4,8 @@ #include "spdlog/spdlog.h" +// set to 1 to use for debugging if no loggers can be created +#define LOG_TO_STDERR 0 namespace marian { void logCallStack(size_t skipLevels); @@ -149,6 +151,9 @@ class Config; template void checkedLog(std::string logger, std::string level, Args... args) { +#if LOG_TO_STDERR + std::cerr << "[" << level << "] " << fmt::format(args...) << std::endl; +#else Logger log = spdlog::get(logger); if(!log) { return; @@ -169,6 +174,7 @@ void checkedLog(std::string logger, std::string level, Args... 
args) { else { log->warn("Unknown log level '{}' for logger '{}'", level, logger); } +#endif } void createLoggers(const marian::Config* options = nullptr); diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 79d685e0e..da5a6572f 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -77,9 +77,9 @@ void Shortlist::createCachedTensors(Expr weights, /////////////////////////////////////////////////////////////////////////////////// -LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize) +LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) : Shortlist(std::vector()), - k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { + k_(k), nbits_(nbits), lemmaSize_(lemmaSize), abortIfDynamic_(abortIfDynamic) { } WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { @@ -99,7 +99,7 @@ void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, "LSH index (--output-approx-knn) currently not implemented for GPU"); - indicesExpr_ = callback(lsh::search(input, weights, k_, nbits_, (int)lemmaSize_), + indicesExpr_ = callback(lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_), [this](Expr node) { node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node }); @@ -135,12 +135,12 @@ void LSHShortlist::createCachedTensors(Expr weights, } } -LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize) - : k_(k), nbits_(nbits), lemmaSize_(lemmaSize) { +LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) + : k_(k), nbits_(nbits), lemmaSize_(lemmaSize), abortIfDynamic_(abortIfDynamic) { } Ptr LSHShortlistGenerator::generate(Ptr batch) const { - return New(k_, nbits_, lemmaSize_); + return New(k_, nbits_, lemmaSize_, abortIfDynamic_); } ////////////////////////////////////////////////////////////////////////////////////// @@ -175,7 +175,7 @@ QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, int32_t header_magic_number = *get(current); ABORT_IF(header_magic_number != MAGIC_NUMBER, "Trying to mmap Quicksand shortlist but encountered wrong magic number"); - auto config = ::quicksand::ParameterTree::FromBinaryReader(current); + auto config = marian::quicksand::ParameterTree::FromBinaryReader(current); use16bit_ = config->GetBoolReq("use_16_bit"); LOG(info, "[data] Mapping Quicksand shortlist from {}", fname); @@ -275,7 +275,7 @@ Ptr createShortlistGenerator(Ptr options, if (lshOpts.size()) { assert(lshOpts.size() == 2); size_t lemmaSize = trgVocab->lemmaSize(); - return New(lshOpts[0], lshOpts[1], lemmaSize); + return New(lshOpts[0], lshOpts[1], lemmaSize, /*abortIfDynamic=*/false); } else { std::vector vals = options->get>("shortlist"); diff --git a/src/data/shortlist.h b/src/data/shortlist.h index f15e54555..6cfb650de 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -74,6 +74,8 @@ class LSHShortlist: public Shortlist { int k_; // number of candidates returned from each input int nbits_; // length of hash size_t lemmaSize_; // vocab size + bool abortIfDynamic_; // if true disallow dynamic allocation for encoded weights and rotation matrix (only allow use of pre-allocated parameters) + static Ptr index_; // LSH index to store all possible candidates static std::mutex mutex_; @@ -84,7 +86,7 @@ class LSHShortlist: public Shortlist { int k); public: - LSHShortlist(int k, int nbits, size_t 
lemmaSize); + LSHShortlist(int k, int nbits, size_t lemmaSize, bool abortIfDynamic = false); virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const override; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; @@ -97,8 +99,10 @@ class LSHShortlistGenerator : public ShortlistGenerator { int k_; int nbits_; size_t lemmaSize_; + bool abortIfDynamic_; + public: - LSHShortlistGenerator(int k, int nbits, size_t lemmaSize); + LSHShortlistGenerator(int k, int nbits, size_t lemmaSize, bool abortIfDynamic = false); Ptr generate(Ptr batch) const override; }; diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp index 89b482f4c..8a9c924ee 100644 --- a/src/layers/lsh.cpp +++ b/src/layers/lsh.cpp @@ -155,7 +155,7 @@ Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows return lambda({encodedQuery, encodedWeights}, kShape, Type::uint32, search); } -Expr search(Expr query, Expr weights, int k, int nBits, int firstNRows) { +Expr search(Expr query, Expr weights, int k, int nBits, int firstNRows, bool abortIfDynamic) { int dim = weights->shape()[-1]; Expr rotMat = nullptr; @@ -164,6 +164,7 @@ Expr search(Expr query, Expr weights, int k, int nBits, int firstNRows) { if(rotMat) { LOG_ONCE(info, "Reusing parameter LSH rotation matrix {} with shape {}", rotMat->name(), rotMat->shape()); } else { + ABORT_IF(abortIfDynamic, "Dynamic creation of LSH rotation matrix prohibited"); LOG_ONCE(info, "Creating ad-hoc rotation matrix with shape {}", Shape({dim, nBits})); rotMat = rotator(weights, nBits); } @@ -173,6 +174,7 @@ Expr search(Expr query, Expr weights, int k, int nBits, int firstNRows) { if(encodedWeights) { LOG_ONCE(info, "Reusing parameter LSH code matrix {} with shape {}", encodedWeights->name(), encodedWeights->shape()); } else { + ABORT_IF(abortIfDynamic, "Dynamic creation of LSH code matrix prohibited"); LOG_ONCE(info, "Creating ad-hoc code matrix with shape {}", Shape({weights->shape()[-2], lsh::bytesPerVector(nBits)})); encodedWeights = encode(weights, rotMat); } diff --git a/src/layers/lsh.h b/src/layers/lsh.h index 60908238f..7a5858914 100644 --- a/src/layers/lsh.h +++ b/src/layers/lsh.h @@ -32,7 +32,7 @@ namespace lsh { Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows = 0); // same as above, but performs encoding on the fly - Expr search(Expr query, Expr weights, int k, int nbits, int firstNRows = 0); + Expr search(Expr query, Expr weights, int k, int nbits, int firstNRows = 0, bool abortIfDynamic = false); // These are helper functions for encoding the LSH into the binary Marian model, used by marian-conv void addDummyParameters(Ptr graph, std::string weightsName, int nBits); diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 70e657a93..099ce1808 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -78,7 +78,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { graph_->setDevice(deviceId, device_); #if MKL_FOUND - mkl_set_num_threads(options->get("mkl-threads", 1)); + mkl_set_num_threads(options_->get("mkl-threads", 1)); #endif std::vector models @@ -114,6 +114,9 @@ class BeamSearchDecoder : public IBeamSearchDecoder { for(auto scorer : scorers_) { scorer->init(graph_); } + + // run parameter init once, this is required for graph_->get("parameter name") to work correctly + graph_->forward(); } void setWorkspace(uint8_t* data, size_t size) override { device_->set(data, size); } @@ -121,8 +124,21 @@ class 
BeamSearchDecoder : public IBeamSearchDecoder { QSNBestBatch decode(const QSBatch& qsBatch, size_t maxLength, const std::unordered_set& shortlist) override { - if(shortlist.size() > 0) { - auto shortListGen = New(shortlist); + + std::vector lshOpts = options_->get>("output-approx-knn", {}); + ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); + ABORT_IF(lshOpts.size() == 2 && shortlist.size() > 0, "LSH and shortlist cannot be used at the same time"); + + if(lshOpts.size() == 2 || shortlist.size() > 0) { + Ptr shortListGen; + // both ShortListGenerators are thin wrappers, hence no problem with calling this per query + if(lshOpts.size() == 2) { + // Setting abortIfDynamic to true disallows memory allocation for LSH parameters, this is specifically for use in Quicksand. + // If we want to use the LSH in Quicksand we need to create a binary model that contains the LSH parameters via conversion. + shortListGen = New(lshOpts[0], lshOpts[1], vocabs_[1]->lemmaSize(), /*abortIfDynamic=*/true); + } else { + shortListGen = New(shortlist); + } for(auto scorer : scorers_) scorer->setShortlistGenerator(shortListGen); } @@ -249,7 +265,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) { // This function converts an fp32 model into an FBGEMM based packed model. // marian defined types are used for external project as well. // The targetPrec is passed as int32_t for the exported function definition. -bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, bool addLsh) { +bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, int32_t lshNBits) { std::cerr << "Converting from: " << inputFile << ", to: " << outputFile << ", precision: " << targetPrec << std::endl; YAML::Node config; @@ -264,9 +280,10 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP // MJD: Note, this is a default settings which we might want to change or expose. Use this only with Polonium students. // The LSH will not be used by default even if it exists in the model. That has to be enabled in the decoder config. - int lshNBits = 1024; std::string lshOutputWeights = "Wemb"; + bool addLsh = lshNBits > 0; if(addLsh) { + std::cerr << "Adding LSH to model with hash size " << lshNBits << std::endl; // Add dummy parameters for the LSH before the model gets actually initialized. // This create the parameters with useless values in the tensors, but it gives us the memory we need. graph->setReloaded(false); diff --git a/src/microsoft/quicksand.h b/src/microsoft/quicksand.h index b710e1352..cddcfd22e 100644 --- a/src/microsoft/quicksand.h +++ b/src/microsoft/quicksand.h @@ -79,7 +79,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name); // MJD: added "addLsh" which will now break whatever compilation after update. That's on purpose. // The calling code should be adapted, not this interface. If you need to fix things in QS because of this // talk to me first! 
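// A sketch of the corresponding call-site update (file names and the precision value
// are placeholders, not taken from this patch):
//   old: convertModel("model.npz", "model.bin", targetPrec, /*addLsh=*/true);
//   new: convertModel("model.npz", "model.bin", targetPrec, /*lshNBits=*/1024);
// 1024 matches the hash size that was previously hard-coded in quicksand.cpp;
// passing 0 for lshNBits skips adding the LSH parameters entirely.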
-bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, bool addLsh); +bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, int32_t lshNBits); } // namespace quicksand } // namespace marian diff --git a/src/microsoft/shortlist/utils/Converter.cpp b/src/microsoft/shortlist/utils/Converter.cpp index c28178cd6..df44b338f 100644 --- a/src/microsoft/shortlist/utils/Converter.cpp +++ b/src/microsoft/shortlist/utils/Converter.cpp @@ -1,5 +1,6 @@ #include "microsoft/shortlist/utils/Converter.h" +namespace marian { namespace quicksand { #include "microsoft/shortlist/logging/LoggerMacros.h" @@ -57,3 +58,4 @@ void Converter::HandleConversionError(const std::string& str, const char * type_ } } // namespace quicksand +} // namespace marian \ No newline at end of file diff --git a/src/microsoft/shortlist/utils/Converter.h b/src/microsoft/shortlist/utils/Converter.h index 9d9dd96d6..ecbb5457f 100644 --- a/src/microsoft/shortlist/utils/Converter.h +++ b/src/microsoft/shortlist/utils/Converter.h @@ -5,6 +5,7 @@ #include #include +namespace marian { namespace quicksand { class Converter { @@ -81,3 +82,4 @@ std::vector Converter::ConvertVectorInternal(I begin, I end, const char * typ } } // namespace quicksand +} // namespace marian \ No newline at end of file diff --git a/src/microsoft/shortlist/utils/ParameterTree.cpp b/src/microsoft/shortlist/utils/ParameterTree.cpp index 465d2e0db..b7396b5ed 100644 --- a/src/microsoft/shortlist/utils/ParameterTree.cpp +++ b/src/microsoft/shortlist/utils/ParameterTree.cpp @@ -5,6 +5,7 @@ #include "microsoft/shortlist/utils/StringUtils.h" #include "microsoft/shortlist/utils/Converter.h" +namespace marian { namespace quicksand { #include "microsoft/shortlist/logging/LoggerMacros.h" @@ -414,4 +415,4 @@ void ParameterTree::ReplaceVariablesInternal( } } // namespace quicksand - +} // namespace marian \ No newline at end of file diff --git a/src/microsoft/shortlist/utils/ParameterTree.h b/src/microsoft/shortlist/utils/ParameterTree.h index 1474ff645..e9052f2e9 100644 --- a/src/microsoft/shortlist/utils/ParameterTree.h +++ b/src/microsoft/shortlist/utils/ParameterTree.h @@ -8,6 +8,7 @@ #include "microsoft/shortlist/utils/StringUtils.h" +namespace marian { namespace quicksand { class ParameterTree { @@ -183,3 +184,4 @@ void ParameterTree::SetParam(const std::string& name, const T& obj) { } } // namespace quicksand +} // namespace marian \ No newline at end of file diff --git a/src/microsoft/shortlist/utils/StringUtils.cpp b/src/microsoft/shortlist/utils/StringUtils.cpp index 7870b5422..e4fb88157 100644 --- a/src/microsoft/shortlist/utils/StringUtils.cpp +++ b/src/microsoft/shortlist/utils/StringUtils.cpp @@ -4,6 +4,7 @@ #include #include +namespace marian { namespace quicksand { #include "microsoft/shortlist/logging/LoggerMacros.h" @@ -336,3 +337,4 @@ std::string StringUtils::ToLower(const std::string& str) { } } // namespace quicksand +} // namespace marian \ No newline at end of file diff --git a/src/microsoft/shortlist/utils/StringUtils.h b/src/microsoft/shortlist/utils/StringUtils.h index 31bb1fcc0..be9d1e540 100644 --- a/src/microsoft/shortlist/utils/StringUtils.h +++ b/src/microsoft/shortlist/utils/StringUtils.h @@ -8,6 +8,7 @@ #include "microsoft/shortlist/utils/PrintTypes.h" +namespace marian { namespace quicksand { class StringUtils { @@ -96,3 +97,4 @@ std::string StringUtils::ToString(const T& obj) { } } // namespace quicksand +} // namespace marian \ No newline at end of file diff --git 
a/src/translator/translator.h b/src/translator/translator.h index 0829f98e6..db1f3d030 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -62,7 +62,7 @@ class Translate : public ModelTask { trgVocab_->load(vocabs.back()); auto srcVocab = corpus_->getVocabs()[0]; - std::vector lshOpts = options_->get>("output-approx-knn"); + std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); if (lshOpts.size() == 2 || options_->hasAndNotEmpty("shortlist")) { From 056c4bef5b99d266f8984fd20b14ab578cd55ee3 Mon Sep 17 00:00:00 2001 From: Rohit Jain Date: Sat, 17 Jul 2021 23:03:16 +0000 Subject: [PATCH 101/254] Merged PR 19860: Case augmented data, if not using factored vocab must not set guided alignments This change allows marking SentenceTuples as 'altered', if they were generated or modified by data augmentation internally in such a way so as to impact processing. In particular, for such sentence tuples, we do not want to try setting guided alignments if the externally provided guided alignments might no longer be correct after that alteration. --- CHANGELOG.md | 1 + src/data/corpus.cpp | 17 ++++++++++++++--- src/data/corpus.h | 2 +- src/data/corpus_base.cpp | 12 +++++++++--- src/data/corpus_base.h | 12 ++++++++++++ 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a0b99272..05658fe10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option ### Fixed +- Do not set guided alignments for case augmented data if vocab is not factored - Various fixes to enable LSH in Quicksand - Added support to MPIWrappest::bcast (and similar) for count of type size_t - Adding new validation metrics when training is restarted and --reset-valid-stalled is used diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index e8ce850b6..d8a364b2e 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -7,6 +7,7 @@ #include "common/filesystem.h" #include "data/corpus.h" +#include "data/factored_vocab.h" namespace marian { namespace data { @@ -26,13 +27,16 @@ Corpus::Corpus(std::vector paths, allCapsEvery_(options_->get("all-caps-every", 0)), titleCaseEvery_(options_->get("english-title-case-every", 0)) {} -void Corpus::preprocessLine(std::string& line, size_t streamId) { +void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { + bool isFactoredVocab = vocabs_.back()->tryAs() != nullptr; + altered = false; if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) { line = vocabs_[streamId]->toUpper(line); if (streamId == 0) LOG_ONCE(info, "[data] Source all-caps'ed line to: {}", line); else LOG_ONCE(info, "[data] Target all-caps'ed line to: {}", line); + altered = isFactoredVocab ? 
false : true; // FS vocab does not really "alter" the token lemma for all caps } else if (titleCaseEvery_ != 0 && pos_ % titleCaseEvery_ == 1 && !inference_ && streamId == 0) { // Only applied to stream 0 (source) since this feature is aimed at robustness against @@ -43,6 +47,7 @@ void Corpus::preprocessLine(std::string& line, size_t streamId) { LOG_ONCE(info, "[data] Source English-title-case'd line to: {}", line); else LOG_ONCE(info, "[data] Target English-title-case'd line to: {}", line); + altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for title casing } } @@ -103,7 +108,10 @@ SentenceTuple Corpus::next() { ++shift; } else { size_t vocabId = j - shift; - preprocessLine(fields[j], vocabId); + bool altered; + preprocessLine(fields[j], vocabId, /*out=*/altered); + if (altered) + tup.markAltered(); addWordsToSentenceTuple(fields[j], vocabId, tup); } } @@ -116,7 +124,10 @@ SentenceTuple Corpus::next() { addWeightsToSentenceTuple(fields[weightFileIdx_], tup); } else { - preprocessLine(line, i); + bool altered; + preprocessLine(line, i, /*out=*/altered); + if (altered) + tup.markAltered(); addWordsToSentenceTuple(line, i, tup); } } diff --git a/src/data/corpus.h b/src/data/corpus.h index 70e7cdfb2..e8e9a9fdb 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -30,7 +30,7 @@ class Corpus : public CorpusBase { // for pre-processing size_t allCapsEvery_{0}; // if set, convert every N-th input sentence (after randomization) to all-caps (source and target) size_t titleCaseEvery_{0}; // ditto for title case (source only) - void preprocessLine(std::string& line, size_t streamId); + void preprocessLine(std::string& line, size_t streamId, bool& altered); // altered => whether the segmentation was altered in marian public: // @TODO: check if translate can be replaced by an option in options diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 5be4298be..5f9a9ee36 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -447,9 +447,15 @@ void CorpusBase::addAlignmentsToBatch(Ptr batch, std::vector aligns(srcWords * dimBatch * trgWords, 0.f); for(int b = 0; b < dimBatch; ++b) { - for(auto p : batchVector[b].getAlignment()) { - size_t idx = p.srcPos * dimBatch * trgWords + b * trgWords + p.tgtPos; - aligns[idx] = 1.f; + + // If the batch vector is altered within marian by, for example, case augmentation, + // the guided alignments we received for this tuple cease to be valid. + // Hence skip setting alignments for that sentence tuple.. + if (!batchVector[b].isAltered()) { + for(auto p : batchVector[b].getAlignment()) { + size_t idx = p.srcPos * dimBatch * trgWords + b * trgWords + p.tgtPos; + aligns[idx] = 1.f; + } } } batch->setGuidedAlignment(std::move(aligns)); diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 8e5e1334d..251df5bc6 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -28,6 +28,7 @@ class SentenceTuple { std::vector tuple_; // [stream index][step index] std::vector weights_; // [stream index] WordAlignment alignment_; + bool altered_ = false; public: typedef Words value_type; @@ -44,6 +45,17 @@ class SentenceTuple { */ size_t getId() const { return id_; } + /** + * @brief Returns whether this Tuple was altered or augmented from what + * was provided to Marian in input. 
+ */ + bool isAltered() const { return altered_; } + + /** + * @brief Mark that this Tuple was internally altered or augmented by Marian + */ + void markAltered() { altered_ = true; } + /** * @brief Adds a new sentence at the end of the tuple. * From f6cb1b5c6aa7b35d80454a7fd01301e097945a3a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 21 Jul 2021 00:12:02 +0000 Subject: [PATCH 102/254] Merged PR 19864: add bias if it exists Fixes backcompat with shortlist and bias. --- src/data/shortlist.h | 3 +++ src/layers/output.cpp | 32 +++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 6cfb650de..82b0df69a 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -43,6 +43,7 @@ class Shortlist { Shortlist(const std::vector& indices); virtual ~Shortlist(); + virtual bool isDynamic() const { return false; } virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const; virtual WordIndex tryForwardMap(WordIndex wIdx) const; @@ -87,6 +88,8 @@ class LSHShortlist: public Shortlist { public: LSHShortlist(int k, int nbits, size_t lemmaSize, bool abortIfDynamic = false); + + virtual bool isDynamic() const override { return true; } virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const override; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; diff --git a/src/layers/output.cpp b/src/layers/output.cpp index d7ba4490a..8fe5096a5 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -59,7 +59,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { /* std::cerr << "affineOrDot.x=" << x->shape() << std::endl; std::cerr << "affineOrDot.W=" << W->shape() << std::endl; - std::cerr << "affineOrDot.b=" << b->shape() << std::endl; + if (b) std::cerr << "affineShortlist.b=" << b->shape() << std::endl; std::cerr << "affineOrDot.transA=" << transA << " transB=" << transB << std::endl; */ if(b) @@ -68,18 +68,32 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { return dot(x, W, transA, transB); }; - auto affineShortlist = [](Expr x, Expr W, Expr b, bool transA, bool transB) { - /* + auto affineShortlist = [this](Expr x, Expr W, Expr b, bool transA, bool transB) { + /* std::cerr << "affineShortlist.x=" << x->shape() << std::endl; std::cerr << "affineShortlist.W=" << W->shape() << std::endl; - std::cerr << "affineShortlist.b=" << b->shape() << std::endl; + if (b) std::cerr << "affineShortlist.b=" << b->shape() << std::endl; std::cerr << "affineShortlist.transA=" << transA << " transB=" << transB << std::endl; */ - ABORT_IF(!(!transA && transB), "affineShortlist. Must be transA==0 and transB==1"); - ABORT_IF(b, "affineShortlist not tested with bias"); - Expr ret = bdot(x, W, transA, transB); - //std::cerr << "ret=" << ret->shape() << std::endl; - //std::cerr << std::endl; + + Expr ret; + + if (b) { + // original shortlist. W always has 1 for beam & batch + ABORT_UNLESS(!shortlist_->isDynamic(), "affineShortlist. Bias not supported with LSH/dynamic shortlist"); // todo rename ABORT_UNLESS to ASSERT + ret = affine(x, W, b, transA, transB); + } + else if (shortlist_->isDynamic()) { + // LSH produces W entry for each beam and batch => need bdot() + ABORT_IF(!(!transA && transB), "affineShortlist. Only tested with transA==0 and transB==1"); + ret = bdot(x, W, transA, transB); + } + else { + // original shortlist. 
W always has 1 for beam & batch + ret = dot(x, W, transA, transB); + } + + //std::cerr << "ret.x=" << ret->shape() << std::endl; return ret; }; From b83b06fb73e6f8ec8bcfcb6b3781c749cce83024 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 22 Jul 2021 16:36:43 +0000 Subject: [PATCH 103/254] Merged PR 19914: Fix Windows Azure Pipelines Updating vcpkg seems to fix the recent issues with Windows builds in Azure Pipelines. --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index dfed6ab40..a11988189 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -19,7 +19,7 @@ variables: CUDA_PATH_WINDOWS: "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA" MKL_DIR: "$(Build.SourcesDirectory)/mkl" MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" - VCPKG_COMMIT: 6185aa76504a5025f36754324abf307cc776f3da + VCPKG_COMMIT: c69096659f49e2b1aca532ea5c2f8c135182519b VCPKG_DIR: "$(Build.SourcesDirectory)/vcpkg" VCPKG_PACKAGES: "protobuf" # The Visual Studio installation directory can be found using: @@ -88,8 +88,8 @@ stages: # Install packages .\vcpkg.exe install --triplet x64-windows-static $(VCPKG_PACKAGES) # Clean to make the cache smaller - Remove-Item $(VCPKG_DIR)\downloads -Force -Recurse - Remove-Item $(VCPKG_DIR)\buildtrees -Force -Recurse + Remove-Item $(VCPKG_DIR)\downloads -Force -Recurse -ErrorAction SilentlyContinue + Remove-Item $(VCPKG_DIR)\buildtrees -Force -Recurse -ErrorAction SilentlyContinue displayName: Prepare vcpkg - script: | From 6b568f4afa44b5bd7c9e335856b977fc054f343c Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 22 Jul 2021 16:44:35 +0000 Subject: [PATCH 104/254] Merged PR 19904: Update instructions for building on Windows Changes proposed in this pull request: 1. Clarified instructions how to build Marian on Windows in `vs/README.md`. 2. `vs/CheckOrInstallDeps.bat` does not stop if CUDA or MKL are not installed. --- vs/CheckOrInstallDeps.bat | 62 ++++++++------ vs/NOTES.md | 31 ++++++- vs/README.md | 174 +++++++++++++------------------------- 3 files changed, 123 insertions(+), 144 deletions(-) diff --git a/vs/CheckOrInstallDeps.bat b/vs/CheckOrInstallDeps.bat index 2fd1f5b3a..65c5e252a 100644 --- a/vs/CheckOrInstallDeps.bat +++ b/vs/CheckOrInstallDeps.bat @@ -2,7 +2,7 @@ :: Usage: CheckOrInstallDeps.bat :: :: This script is used to verify that all the dependencies required to build Marian are available. -:: The Cuda SDK and the Intel MKL must be installed beforehand by the user. +:: The CUDA SDK and the Intel MKL must be installed beforehand by the user. :: The rest of libraries (see README.md), if not found, will be installed by this script using :: vcpkg. :: @@ -96,6 +96,9 @@ echo. echo --- Checking dependencies... set CMAKE_OPT= +set FOUND_CUDA= +set FOUND_MKL= +set FOUND_BOOST= :: ------------------------- @@ -105,8 +108,9 @@ echo. echo ... CUDA if "%CUDA_PATH%"=="" ( echo The CUDA_PATH environment variable is not defined: this will compile only the CPU version. + set "FOUND_CUDA=false" ) else ( - echo Found Cuda SDK in %CUDA_PATH% + echo Found Cuda SDK in "%CUDA_PATH%" ) :: ------------------------- @@ -119,26 +123,30 @@ echo ... 
Intel MKL if "%MKLROOT%" == "" ( set "MKLROOT=C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl" ) + if not exist "%MKLROOT%" ( echo MKLROOT is set to a non existing path: - echo "%MKLROOT%" + echo "%MKLROOT%" echo Please make sure the Intel MKL libraries are installed and set MKLROOT to the installation path. - exit /b 1 -) -if not exist "%MKLROOT%\include\mkl_version.h" ( + set "FOUND_MKL=false" +) else if not exist "%MKLROOT%\include\mkl_version.h" ( echo MKL header files were not found in this folder: - echo "%MKLROOT%" + echo "%MKLROOT%\include" echo Please make sure Intel MKL is properly installed. - exit /b 1 -) -if not exist "%MKLROOT%\lib\intel64\mkl_core.lib" ( + set "FOUND_MKL=false" +) else if not exist "%MKLROOT%\lib\intel64\mkl_core.lib" ( echo MKL library files were not found in this folder: - echo "%MKLROOT%" + echo "%MKLROOT%\lib\intel64" echo Please make sure Intel MKL is properly installed. - exit /b 1 + set "FOUND_MKL=false" +) else ( + echo Found Intel MKL library in "%MKLROOT%" ) -echo Found Intel MKL library in %MKLROOT% +if "%FOUND_MKL%" == "false" if "%FOUND_CUDA%" == "false" ( + echo. + echo Error: neither CUDA SDK nor Intel MKL were found, but at least one of them must be installed. +) :: ------------------------- :: BOOST_INCLUDEDIR and BOOST_LIBRARYDIR can be both set to an existing Boost installation. @@ -156,29 +164,31 @@ if not exist "%BOOST_INCLUDEDIR%" ( echo BOOST_INCLUDEDIR is set to a non existing path: echo "%BOOST_INCLUDEDIR%" echo Please set BOOST_INCLUDEDIR and BOOST_LIBRARYDIR to the installation path of the Boost library. - exit /b 1 -) -if not exist "%BOOST_INCLUDEDIR%\boost\version.hpp" ( + set "FOUND_BOOST=false" +) else if not exist "%BOOST_INCLUDEDIR%\boost\version.hpp" ( echo Boost header files were not found in this folder: - echo "%BOOST_INCLUDEDIR%" + echo "%BOOST_INCLUDEDIR%\boost" echo Please make sure Boost is correctly installed. - exit /b 1 -) - -if not exist "%BOOST_LIBRARYDIR%" ( + set "FOUND_BOOST=false" +) else if not exist "%BOOST_LIBRARYDIR%" ( echo BOOST_LIBRARYDIR is set to a non existing path: echo "%BOOST_LIBRARYDIR%" echo Please set BOOST_INCLUDEDIR and BOOST_LIBRARYDIR to the installation path of the Boost library. - exit /b 1 -) -if not exist "%BOOST_LIBRARYDIR%\boost_*.lib" ( + set "FOUND_BOOST=false" +) else if not exist "%BOOST_LIBRARYDIR%\boost_*.lib" ( echo Boost library files were not found in this folder: echo "%BOOST_LIBRARYDIR%" echo Please make sure Boost is correctly installed. - exit /b 1 + set "FOUND_BOOST=false" +) else ( + echo Found Boost headers in "%BOOST_INCLUDEDIR%" and libs in "%BOOST_LIBRARYDIR%" +) + +if "%FOUND_BOOST%" == "false" ( + echo. + echo Warning: Boost was not found. marian-server will not be compiled. ) -echo Found Boost headers in "%BOOST_INCLUDEDIR%" and libs in "%BOOST_LIBRARYDIR%" :: ------------------------- :: OPENSSL_ROOT_DIR can be set to an existing OpenSSL installation. diff --git a/vs/NOTES.md b/vs/NOTES.md index 4d00fab50..fb6195e6e 100644 --- a/vs/NOTES.md +++ b/vs/NOTES.md @@ -1,6 +1,35 @@ # How to build Marian on Windows with GPU support -This is interesting for developers, exctracted from README. +This is interesting mostly for developers. Warning: it has been extracted from +an old `vs/README.md` and some information might be outdated. + +--- +## Known issues + +1. __Patch for CUDA 9.2 error: Unsupported Visual Studio Version Error__ + + When using CUDA 9.2, the latest versions of Visual Studio 2017 are not + officially supported by CUDA. 
Two fixes are proposed: + - Downgrade Visual Studio to a supported version + - Edit the file `\include\crt\host_config.h` and change the line 131: + + 131 #if _MSC_VER < 1600 || _MSC_VER > 1914 + + into: + + 131 #if _MSC_VER < 1600 || _MSC_VER > 1915 + + For more information, read this [nVidia forum](https://devtalk.nvidia.com/default/topic/1022648/cuda-setup-and-installation/cuda-9-unsupported-visual-studio-version-error/4) + +2. __It does not compile with Boost 1.73 or newer__ + + It may happen that SimpleWebSocketServer, a 3rd party library that Marian uses for + marian-server, does not support the version of Boost available in vcpkg. In such case install a + supported version of Boost; if you use vcpkg, an option is to checkout to #5970385, which has + Boost 1.72. + + Note that Boost is required only if you compile with marian-server, for compilation using CMake, + it is if you set `COMPILE_SERVER` to `TRUE` in CMakeSettings.json. --- ## Changes from the master branch diff --git a/vs/README.md b/vs/README.md index c445f99fb..e3a5ee35d 100644 --- a/vs/README.md +++ b/vs/README.md @@ -1,34 +1,34 @@ -# How to build Marian on Windows with GPU support +# Building Marian on Windows ## Install prerequisites -The following SDK are required to build Marian with GPU support. At least one of them needs to be -installed. If only CUDA is installed but not MKL, a GPU-only version will be build. If only MKL is -installed and not CUDA, only the CPU version will be built. So if you are interested in only one -functionality, you can omit one of them. Install both for full functionality. +At least one of the following SDK is required to build Marian on Windows: - [CUDA](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal), - Base installer, CUDA 10.0+ is recommended, there might be issues with CUDA 9.2, see below + Base installer, CUDA 10.0+ is recommended, there might be issues with CUDA + 9.2, see below - [Intel MKL](https://software.intel.com/en-us/mkl) +CUDA is required for running Marian on GPU, and Intel MKL for CPU. If only one +of them is installed, a GPU-only or CPU-only version can be built. + --- ## Check dependencies : `CheckOrInstallDeps.bat` -In addition to the 2 previous prerequisites, Marian may need the following libraries that you may -already have on your system: +The script `CheckOrInstallDeps.bat` can be used to verify that all dependencies +are found on your system. If not, it will use the `vcpkg` library manager to +download and manage your dependencies for CMake, including the following +optional libraries needed only if you want to compile Marian server: - - Boost (1.58-1.72), optional for marian-server (`COMPILE_SERVER=TRUE` in CMake) + - Boost (1.58-1.72), optional for marian-server (`COMPILE_SERVER=TRUE` in + `CMakeSettings.json`) - OpenSSL, optional for marian-server -The script `CheckOrInstallDeps.bat` can be used to verify that all dependencies are found on your -system. If not, it will use the `vcpkg` library manager to download and manage your dependencies for -CMake. - -If you already have a working `vcpkg` installation, this script can use it. -If vcpkg is in your `PATH` environment variable, the script will find it and use it automatically. -Otherwise, you need to edit the script and set the `VCPKG_ROOT` variable. -Please see the script for more details. +If you already have a working `vcpkg` installation, this script can use it. 
If +vcpkg is in your `PATH` environment variable, the script will find it and use +it automatically. Otherwise, you need to edit the script and set the +`VCPKG_ROOT` variable. Please see the script for more details. --- ## Build the project @@ -41,119 +41,59 @@ There are 3 alternatives to build the project: ### 1. Use VS2017+ with built-in support for CMake -VS2017 or newer now allows to develop projects built with CMake without the need to generate VS -projects and solutions. For more information, please read [this article](https://blogs.msdn.microsoft.com/vcblog/2016/10/05/cmake-support-in-visual-studio/) +VS2017 or newer now allows to develop projects built with CMake without the +need to generate VS projects and solutions. For more information, please read +[this article](https://blogs.msdn.microsoft.com/vcblog/2016/10/05/cmake-support-in-visual-studio/) from the Visual C++ Team. -You just need to open the root folder of the git repository in VS (which contains the file -`CMakeSettings.json`): - -- In an Explorer window, right-click then `Open in Visual Studio` -- In a VS2017 instance, `File > Open > Folder...` - -You may need to edit the file `CMakeSettings.json` to set the environment variable for the -dependencies. - -The developing experience is very similar that when using a solution file (Intellisense, build -project with `F7`, debug, set breakpoints and watch variables, ...), except that the project -configuration is done in 3 different files: - - - `CMakeList.txt`: this is the CMake source file from the original project. - It is used to configure the build targets, add/remove files to compile and configure the - compiler flags. - - - `CMakeSettings.json`: this file is required to enable CMake integration in VS2017. - Use this file to configure the environment variables and the parameters passed to CMake to - generate the project. - - - `.vs\launch.vs.json`: this is an optional user specific file and it is not commited in the Git - repo. Use this file to configure the debugging targets. For example: - - { - "version": "0.2.1", - "defaults": {}, - "configurations": [ - { - "type": "default", - "name": "Training Basics", - "project": "CMakeLists.txt", - "projectTarget": "marian.exe", - "currentDir": "D:\\Perso\\github\\marian\\marian-examples\\training-basics", - "args": [ - "--devices 0", - "--type amun", - "--model model/model.npz", - "--train-sets data/corpus.bpe.ro data/corpus.bpe.en", - "--vocabs model/vocab.ro.yml model/vocab.en.yml", - "--dim-vocabs 66000 50000", - "--mini-batch-fit", - "-w 3000", - "--layer-normalization", - "--dropout-rnn 0.2", - "--dropout-src 0.1", - "--dropout-trg 0.1", - "--early-stopping 5", - "--valid-freq 100", - "--save-freq 10000", - "--disp-freq 100", - "--valid-metrics cross-entropy translation", - "--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en", - "--valid-script-path .\\scripts\\validate.bat", - "--log model/train.log", - "--valid-log model/valid.log", - "--overwrite", - "--keep-best", - "--seed 1111", - "--exponential-smoothing", - "--normalize 1", - "--beam-size 12", - "--quiet-translation" - ] - } - ] - } - - -### 2. Create solution and projects files for Visual Studio : `CreateVSProjects.bat` - -If you have a previous version of Visual Studio, you will need to use CMake to generate the projects -files. +1. Open the root folder of the git repository in VS (which contains the file + `CMakeSettings.json`) using `Open local folder` on the welcome page or `File + > Open > Folder...` in a VS instance. +2. 
Edit the file `CMakeSettings.json` to set the environment variable for the + dependencies. Set `COMPILE_CUDA` or `COMPILE_CPU` to `FALSE` if you wish to + compile a CPU-only or a GPU-only version respectively. +3. VS2017 should automatically detect `CMakeSettings.json` and generate CMake + Cache. +4. Build the project with `F7`. If build is successful, the executables will be + in the `build` folder. -The provided script `CreateVSProjects.bat` runs the dependency checks then invokes CMake with the -right parameters to create the solutions for Visual Studio. +#### Development -### 3. Use MSBuild : `BuildRelease.bat` +The developing experience is very similar that when using a solution file +(Intellisense, build project with `F7`, debug, set breakpoints and watch +variables, ...), except that the project configuration is done in 3 different +files: -The last alternative is to use the script `BuildRelease.bat` that will: -- Check the dependencies -- Create the VS project files -- Invoke MSBuild on these projects to build the targets in Release. +- `CMakeList.txt`: this is the CMake source file from the original project. + It is used to configure the build targets, add/remove files to compile and configure the + compiler flags. ---- -## Known issues +- `CMakeSettings.json`: this file is required to enable CMake integration in VS2017. + Use this file to configure the environment variables and the parameters passed to CMake to + generate the project. -1. __Patch for CUDA 9.2 error: Unsupported Visual Studio Version Error__ +- `.vs\launch.vs.json`: this is an optional user specific file and it is not commited in the Git + repo. Use this file to configure the debugging targets. - When using CUDA 9.2, the latest versions of Visual Studio 2017 are not officially supported by - CUDA. Two fixes are proposed: - - Downgrade Visual Studio to a supported version - - Edit the file `\include\crt\host_config.h` and change the line 131: - 131 #if _MSC_VER < 1600 || _MSC_VER > 1914 +### 2. Create solution and projects files for Visual Studio : `CreateVSProjects.bat` - into: +If you have a previous version of Visual Studio, you will need to use CMake to +generate the projects +files. - 131 #if _MSC_VER < 1600 || _MSC_VER > 1915 +The provided script `CreateVSProjects.bat` runs the dependency checks then +invokes CMake with the right parameters to create the solutions for Visual +Studio. - For more information, read this [nVidia forum](https://devtalk.nvidia.com/default/topic/1022648/cuda-setup-and-installation/cuda-9-unsupported-visual-studio-version-error/4) +Warning: the Visual Studio Solution file included in the `vs/` folder might not +work out of the box with your environment and require customization. -2. __It does not compile with Boost 1.73 or newer__ - It may happen that SimpleWebSocketServer, a 3rd party library that Marian uses for - marian-server, does not support the version of Boost available in vcpkg. In such case install a - supported version of Boost; if you use vcpkg, an option is to checkout to #5970385, which has - Boost 1.72. +### 3. Use MSBuild : `BuildRelease.bat` - Note that Boost is required only if you compile with marian-server, for compilation using CMake, - it is if you set `COMPILE_SERVER` to `TRUE` in CMakeSettings.json. +The last alternative is to use the script `BuildRelease.bat` that will: +- Check the dependencies. +- Create the VS project files. +- Invoke MSBuild on these projects to build the targets in Release. 
From b653db0a9b5fc37b87d60572465724af80717805 Mon Sep 17 00:00:00 2001 From: Martin Junczys-Dowmunt Date: Thu, 22 Jul 2021 21:00:44 +0000 Subject: [PATCH 105/254] Merged PR 19910: Fix training/scoring error with FSM Fixes a dimension mismatch during training and scoring introduced in the decoding-only shortlist changes. Related work items: #122643 --- src/layers/output.cpp | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 8fe5096a5..92cccdfb6 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -273,27 +273,27 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } #endif // re-embedding lookup, soft-indexed by softmax - Expr cachedShortLemmaEt; + Expr e; if(shortlist_) { // short-listed version of re-embedding matrix - cachedShortLemmaEt = shortlist_->getCachedShortLemmaEt(); + Expr cachedShortLemmaEt = shortlist_->getCachedShortLemmaEt(); + // std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + // std::cerr << "cachedShortLemmaEt=" << cachedShortLemmaEt->shape() << std::endl; + const Shape &fShape = factorSoftmax->shape(); + ABORT_IF(fShape[1] != 1, "We are decoding with a shortlist but time step size {} != 1??", fShape[1]); + factorSoftmax = reshape(factorSoftmax, {fShape[0], fShape[2], 1, fShape[3]}); // we can switch dims because time step is of size 1 + // std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; + e = bdot(factorSoftmax, cachedShortLemmaEt, false, true); + // std::cerr << "e.1=" << e->shape() << std::endl; + const Shape &eShape = e->shape(); + e = reshape(e, {eShape[0], 1, eShape[1], eShape[3]}); // switch dims back, again possible because time step is of size 1 + // std::cerr << "e.2=" << e->shape() << std::endl; + // std::cerr << std::endl; + } else { // for scoring, training and decoding without a shortlist we use a simple dot operation + e = dot(factorSoftmax, + lemmaEt_, + false, + true); // [B... x L] } - else { - const Shape &s = lemmaEt_->shape(); - //std::cerr << "lemmaEt_=" << lemmaEt_->shape() << std::endl; - cachedShortLemmaEt = reshape(lemmaEt_, {1, 1, s[0], s[1]}); - } - //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; - //std::cerr << "cachedShortLemmaEt.2=" << cachedShortLemmaEt->shape() << std::endl; - factorSoftmax = transpose(factorSoftmax, {0, 2, 1, 3}); - //std::cerr << "factorSoftmax=" << factorSoftmax->shape() << std::endl; - //std::cerr << "cachedShortLemmaEt.2=" << cachedShortLemmaEt->shape() << std::endl; - - Expr e = bdot(factorSoftmax, cachedShortLemmaEt, false, true); - //std::cerr << "e.1=" << e->shape() << std::endl; - const Shape &eShape = e->shape(); - e = reshape(e, {eShape[0], 1, eShape[1], eShape[3]}); - //std::cerr << "e.3=" << e->shape() << std::endl; - //std::cerr << std::endl; // project it back to regular hidden dim int inputDim = input1->shape()[-1]; From 4ff2ef189e3d6cca8ab2169f15f1f26739bb7884 Mon Sep 17 00:00:00 2001 From: Rohit Jain Date: Fri, 30 Jul 2021 03:28:00 +0000 Subject: [PATCH 106/254] Merged PR 19761: Expose SPM Interface from Marian This PR adds interfaces in Marian to allow it to handle segmentation duties. 
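A minimal sketch of how the new C-style interop in src/microsoft/sentencepiece.h might be called from native code; the model path, input sentence, and error handling below are assumptions rather than part of this change:

    #include <cstdint>
    #include <cstdio>
    #include "microsoft/sentencepiece.h"

    int main() {
      using namespace marian::spm;

      // The interop takes UTF-16 paths; cast a char16_t literal accordingly.
      const char16_t* modelPath = u"model.spm";  // assumed path
      intptr_t spm = SentencePieceInteropLoadModel(
          reinterpret_cast<const uint16_t*>(modelPath), /*vocab=*/nullptr, /*vocabSize=*/0);
      if(!spm)
        return 1;  // model could not be loaded

      // Encode one sentence and inspect the pieces with their surface alignment.
      Native_SentencePieceText* spt = nullptr;
      char sentence[] = "Hello world";  // assumed input
      if(SentencePieceInteropEncodeAligned(spm, sentence, &spt) == 0) {
        for(int i = 0; i < spt->num_pieces; ++i)
          std::printf("%s\tid=%d\tspan=[%d,%d)\n",
                      spt->pieces[i]->piece, spt->pieces[i]->id,
                      spt->pieces[i]->begin, spt->pieces[i]->end);
        SentencePieceInteropFreeNativeSentencePieceText(spt);  // caller releases the result
      }

      SentencePieceInteropUnloadModel(spm);
      return 0;
    }

The remaining entry points, SentencePieceInteropDecodeAligned and SentencepieceInteropTrainModel, follow the same pattern of an opaque handle plus an integer status code.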
Related work items: #121418 --- CMakeLists.txt | 9 +- src/3rd_party/CMakeLists.txt | 17 +- src/CMakeLists.txt | 4 + src/microsoft/sentencepiece.cpp | 169 +++++++++++++++++ src/microsoft/sentencepiece.h | 38 ++++ src/microsoft/unicode_conversions.h | 282 ++++++++++++++++++++++++++++ 6 files changed, 505 insertions(+), 14 deletions(-) create mode 100644 src/microsoft/sentencepiece.cpp create mode 100644 src/microsoft/sentencepiece.h create mode 100644 src/microsoft/unicode_conversions.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 119bc01f1..870fb70be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,11 +81,16 @@ if(MSVC) # These are used in src/CMakeLists.txt on a per-target basis list(APPEND ALL_WARNINGS /WX; /W4;) - # Disabled bogus warnings for CPU intrinsics: + # Disabled bogus warnings for CPU intrinsics and Protobuf: + # C4100: 'identifier' : unreferenced formal parameter # C4310: cast truncates constant value # C4324: 'marian::cpu::int16::`anonymous-namespace'::ScatterPut': structure was padded due to alignment specifier # C4702: unreachable code; note it is also disabled globally in the VS project file - set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\"") + if(USE_SENTENCEPIECE) + set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4100\"") + else() + set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\"") + endif() # set(INTRINSICS "/arch:AVX") add_definitions(-DUSE_SSE2=1) diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 2bef31296..c21868e33 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -71,9 +71,7 @@ if(USE_SENTENCEPIECE) endif() # regardless of -DUSE_STATIC_LIBS setting always build sentencepiece statically - if(NOT GENERATE_MARIAN_INSTALL_TARGETS) - set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE) - endif() + set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE) set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.") if(USE_STATIC_LIBS) @@ -111,16 +109,11 @@ if(USE_SENTENCEPIECE) set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) endif() + # regardless of -DUSE_STATIC_LIBS setting always build sentencepiece statically if(GENERATE_MARIAN_INSTALL_TARGETS) - if(USE_STATIC_LIBS) - install(TARGETS sentencepiece-static sentencepiece_train-static - EXPORT marian-targets - DESTINATION sentencepiece) - else() - install(TARGETS sentencepiece sentencepiece_train - EXPORT marian-targets - DESTINATION sentencepiece) - endif() + install(TARGETS sentencepiece-static sentencepiece_train-static + EXPORT marian-targets + DESTINATION sentencepiece) endif(GENERATE_MARIAN_INSTALL_TARGETS) endif(USE_SENTENCEPIECE) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1f5db423f..e4599c407 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,9 @@ include_directories(.) include_directories(3rd_party) include_directories(3rd_party/SQLiteCpp/include) include_directories(3rd_party/sentencepiece) +if(USE_SENTENCEPIECE) + include_directories(3rd_party/sentencepiece/third_party/protobuf-lite) +endif(USE_SENTENCEPIECE) include_directories(3rd_party/fbgemm/include) include_directories(3rd_party/intgemm) include_directories(${CMAKE_BINARY_DIR}/src/3rd_party/intgemm) # running cmake on the intgemm submodule triggers config file generation in this directory. 
@@ -110,6 +113,7 @@ set(MARIAN_SOURCES # this is only compiled to catch build errors microsoft/quicksand.cpp + microsoft/sentencepiece.cpp microsoft/cosmos.cpp # copied from quicksand to be able to read binary shortlist diff --git a/src/microsoft/sentencepiece.cpp b/src/microsoft/sentencepiece.cpp new file mode 100644 index 000000000..d1b02b673 --- /dev/null +++ b/src/microsoft/sentencepiece.cpp @@ -0,0 +1,169 @@ +#include +#include +#include +#include + +#ifdef USE_SENTENCEPIECE +#include "sentencepiece.h" + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsuggest-override" +#endif + +#include "sentencepiece/src/builtin_pb/sentencepiece.pb.h" + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#include "sentencepiece/src/sentencepiece_processor.h" +#include "sentencepiece/src/sentencepiece_trainer.h" +#include "unicode_conversions.h" + +namespace marian { +namespace spm { +class SentencePieceInternal { + std::unique_ptr m_processor; + + void checkStatus(sentencepiece::util::Status status, const char* what) { + if(status.ok()) + return; + std::string err = status.ToString(); + std::cerr << err << std::endl; + throw std::runtime_error(std::string("SentencePiece error ") + what + ": " + err); + } + + int createNativeSentencePieceText(sentencepiece::SentencePieceText& spt, Native_SentencePieceText** outSpt) { + Native_SentencePieceText* spt_ret = new Native_SentencePieceText(); + + spt_ret->text = new char[spt.text().size() + 1]; + ::strcpy(spt_ret->text, spt.text().c_str()); + + spt_ret->num_pieces = spt.pieces().size(); + spt_ret->pieces = new Native_SentencePiecePiece*[spt_ret->num_pieces]; + + int counter = 0; + for(auto& piece : spt.pieces()) { + spt_ret->pieces[counter] = new Native_SentencePiecePiece(); + spt_ret->pieces[counter]->id = piece.id(); + spt_ret->pieces[counter]->begin = piece.begin(); + spt_ret->pieces[counter]->end = piece.end(); + spt_ret->pieces[counter]->surface = new char[piece.surface().size() + 1]; + ::strcpy((spt_ret->pieces)[counter]->surface, (char*)piece.surface().c_str()); + spt_ret->pieces[counter]->piece = new char[piece.piece().size() + 1]; + ::strcpy((spt_ret->pieces)[counter]->piece, (char*)piece.piece().c_str()); + counter++; + } + *outSpt = spt_ret; + return 0; + } + +public: + + SentencePieceInternal(const uint16_t* modelPath, const uint16_t** vocab, size_t vocabSize) { + m_processor.reset(new sentencepiece::SentencePieceProcessor()); + // load the model file + const auto status = m_processor->Load(utf16_to_utf8(utf16string(modelPath))); + // implant the restricted vocabulary, if given + if(vocab && vocabSize > 0) { + std::vector vocab_str; + for(size_t i = 0; i < vocabSize; i++) + vocab_str.push_back(utf16_to_utf8(utf16string(vocab[i]))); + + m_processor->SetVocabulary(vocab_str); + } + checkStatus(status, "loading"); + } + + int getPieceID(char* sentence) { + std::string sentInUtf8(sentence); + return m_processor->PieceToId(absl::string_view(sentInUtf8)); + } + + int encodeAligned(char* sentence, Native_SentencePieceText** nSpt) { + sentencepiece::SentencePieceText spt; + std::string sentInUtf8(sentence); + m_processor->Encode(absl::string_view(sentInUtf8), &spt); + + return createNativeSentencePieceText(spt, nSpt); + } + + int decodeAligned(int num_tokens, char** inp_tokens, Native_SentencePieceText** nSpt) { + sentencepiece::SentencePieceText spt; + std::vector tokens; + for(int i = 0; i < num_tokens; i++) { + std::string tok((char*)inp_tokens[i]); + tokens.push_back(tok); + } + m_processor->Decode(tokens, 
&spt); + return createNativeSentencePieceText(spt, nSpt); + } +}; + +int SentencePieceInteropFreeNativeSentencePieceText(Native_SentencePieceText* spt) { + auto num_pieces = (*spt).num_pieces; + for(int i = 0; i < num_pieces; i++) { + Native_SentencePiecePiece* piece = (*spt).pieces[i]; + delete(piece->surface); + delete(piece->piece); + delete(piece); + } + delete[]((*spt).pieces); + delete[]((*spt).text); + delete(spt); + spt = NULL; + return 0; +} + +intptr_t SentencePieceInteropLoadModel(const uint16_t* modelPath, + const uint16_t** vocab, + size_t vocabSize) { + try { + return (intptr_t) new SentencePieceInternal(modelPath, vocab, vocabSize); + } + catch(...) { return (intptr_t) nullptr; } +} + +int SentencePieceInteropDecodeAligned(intptr_t object, + int num_tokens, + char** tokens, + Native_SentencePieceText** nSpt) { + try { + return ((SentencePieceInternal*)object)->decodeAligned(num_tokens, tokens, nSpt); + } + catch(...) { return -1; } +} + +int SentencePieceInteropEncodeAligned(intptr_t object, + char* word, + Native_SentencePieceText** nSpt) { + try { + return ((SentencePieceInternal*)object)->encodeAligned(word, nSpt); + } + catch(...) { return -1; } +} + +int SentencePieceInteropGetPieceID(intptr_t object, char* word) { + try { + return ((SentencePieceInternal*)object)->getPieceID(word); + } + catch(...) { return -1; } +} + +int SentencePieceInteropUnloadModel(intptr_t object) { + delete(SentencePieceInternal*)object; + return 0; +} + +int SentencepieceInteropTrainModel(char* args) { + std::stringstream command; + command << std::string(args); + auto status = sentencepiece::SentencePieceTrainer::Train(command.str()); + return (int)status.code(); +} + +} // namespace spm +} // namespace marian + +#endif \ No newline at end of file diff --git a/src/microsoft/sentencepiece.h b/src/microsoft/sentencepiece.h new file mode 100644 index 000000000..afd0048c4 --- /dev/null +++ b/src/microsoft/sentencepiece.h @@ -0,0 +1,38 @@ +#pragma once +#include + +namespace marian { +namespace spm { + +// Describes an individual token in a sentencepiece encoding +struct Native_SentencePiecePiece { + int id; + int begin; + int end; + char* surface; + char* piece; +}; + +// Mirrors the SentencePieceText protobuf struct returned by SPM +// and provides individual piece and corresponding surface details +struct Native_SentencePieceText { + char* text; + int num_pieces; + Native_SentencePiecePiece** pieces; +}; + +int SentencePieceInteropFreeNativeSentencePieceText(Native_SentencePieceText* spt); +intptr_t SentencePieceInteropLoadModel(const uint16_t* modelPath, + const uint16_t** vocab, + size_t vocabSize); +int SentencePieceInteropDecodeAligned(intptr_t object, + int num_tokens, + char** tokens, + Native_SentencePieceText** nSpt); +int SentencePieceInteropEncodeAligned(intptr_t object, char* word, Native_SentencePieceText** nSpt); +int SentencePieceInteropGetPieceID(intptr_t object, char* word); +int SentencePieceInteropUnloadModel(intptr_t object); +int SentencepieceInteropTrainModel(char* args); + +} // namespace spm +} // namespace marian \ No newline at end of file diff --git a/src/microsoft/unicode_conversions.h b/src/microsoft/unicode_conversions.h new file mode 100644 index 000000000..3b9c09dc9 --- /dev/null +++ b/src/microsoft/unicode_conversions.h @@ -0,0 +1,282 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
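A minimal sketch of how the interop entry points declared above in `src/microsoft/sentencepiece.h` could be driven from plain C++. This is a hypothetical harness, not part of the patch: the include path, model file name and input sentence are invented, and the real implementation is only compiled when `USE_SENTENCEPIECE` is defined.
```
// Hypothetical native harness; the include path, model file and sentence are invented.
// In the real build this API is only available when USE_SENTENCEPIECE is defined.
#include <cstdint>
#include <cstdio>
#include <string>
#include "microsoft/sentencepiece.h"

int main() {
  using namespace marian::spm;

  // The interop layer takes UTF-16 strings (it converts them internally via utf16_to_utf8()).
  std::u16string modelPath = u"vocab.spm";
  intptr_t handle = SentencePieceInteropLoadModel(
      reinterpret_cast<const uint16_t*>(modelPath.c_str()), /*vocab=*/nullptr, /*vocabSize=*/0);
  if(handle == 0)
    return 1;  // load failed: the implementation returns a null handle on any exception

  std::string sentence = "Trump tested positive for COVID-19.";
  Native_SentencePieceText* spt = nullptr;
  if(SentencePieceInteropEncodeAligned(handle, &sentence[0], &spt) == 0) {
    for(int i = 0; i < spt->num_pieces; i++)
      std::printf("%s -> id %d, surface span [%d,%d)\n",
                  spt->pieces[i]->piece, spt->pieces[i]->id,
                  spt->pieces[i]->begin, spt->pieces[i]->end);
    SentencePieceInteropFreeNativeSentencePieceText(spt);  // caller owns the returned struct
  }
  SentencePieceInteropUnloadModel(handle);
  return 0;
}
```
Decoding goes the other way through `SentencePieceInteropDecodeAligned()`, which takes an array of piece strings and hands back the detokenized surface text in the same `Native_SentencePieceText` structure; `SentencepieceInteropTrainModel()` simply forwards its argument string to `sentencepiece::SentencePieceTrainer::Train()`.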
+ +// This was extracted from https://github.com/microsoft/cpprestsdk/blob/cdae258bfb22f948c7b768b4dc56f5f4a2d9b2ce/Release/src/utilities/asyncrt_utils.cpp#L305 + +#include +#include + +typedef std::basic_string utf16string; + +#define LOW_3BITS 0x7 +#define LOW_4BITS 0xF +#define LOW_5BITS 0x1F +#define LOW_6BITS 0x3F +#define BIT4 0x8 +#define BIT5 0x10 +#define BIT6 0x20 +#define BIT7 0x40 +#define BIT8 0x80 +#define L_SURROGATE_START 0xDC00 +#define L_SURROGATE_END 0xDFFF +#define H_SURROGATE_START 0xD800 +#define H_SURROGATE_END 0xDBFF +#define SURROGATE_PAIR_START 0x10000 + +// Create a dedicated type for characters to avoid the issue +// of different platforms defaulting char to be either signed +// or unsigned. +using UtilCharInternal_t = signed char; + +inline size_t count_utf8_to_utf16(const std::string& s) +{ + const size_t sSize = s.size(); + auto const sData = reinterpret_cast(s.data()); + size_t result {sSize}; + + for (size_t index = 0; index < sSize;) + { + if (sData[index] >= 0) + { + // use fast inner loop to skip single byte code points (which are + // expected to be the most frequent) + while ((++index < sSize) && (sData[index] >= 0)) + ; + + if (index >= sSize) break; + } + + // start special handling for multi-byte code points + const UtilCharInternal_t c {sData[index++]}; + + if ((c & BIT7) == 0) + { + throw std::range_error("UTF-8 string character can never start with 10xxxxxx"); + } + else if ((c & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF + { + if (index == sSize) + { + throw std::range_error("UTF-8 string is missing bytes in character"); + } + + const UtilCharInternal_t c2 {sData[index++]}; + if ((c2 & 0xC0) != BIT8) + { + throw std::range_error("UTF-8 continuation byte is missing leading bit mask"); + } + + // can't require surrogates for 7FF + --result; + } + else if ((c & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF + { + if (sSize - index < 2) + { + throw std::range_error("UTF-8 string is missing bytes in character"); + } + + const UtilCharInternal_t c2 {sData[index++]}; + const UtilCharInternal_t c3 {sData[index++]}; + if (((c2 | c3) & 0xC0) != BIT8) + { + throw std::range_error("UTF-8 continuation byte is missing leading bit mask"); + } + + result -= 2; + } + else if ((c & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF + { + if (sSize - index < 3) + { + throw std::range_error("UTF-8 string is missing bytes in character"); + } + + const UtilCharInternal_t c2 {sData[index++]}; + const UtilCharInternal_t c3 {sData[index++]}; + const UtilCharInternal_t c4 {sData[index++]}; + if (((c2 | c3 | c4) & 0xC0) != BIT8) + { + throw std::range_error("UTF-8 continuation byte is missing leading bit mask"); + } + + const uint32_t codePoint = + ((c & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS); + result -= (3 - (codePoint >= SURROGATE_PAIR_START)); + } + else + { + throw std::range_error("UTF-8 string has invalid Unicode code point"); + } + } + + return result; +} + +utf16string /*__cdecl conversions::*/utf8_to_utf16(const std::string& s) +{ + // Save repeated heap allocations, use the length of resulting sequence. 
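// For example, "a€" arrives as four UTF-8 bytes (0x61 0xE2 0x82 0xAC) but occupies only two
// UTF-16 code units (0x0061 0x20AC), which is exactly what count_utf8_to_utf16() returns, so dest is sized once.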
+ const size_t srcSize = s.size(); + auto const srcData = reinterpret_cast(s.data()); + utf16string dest(count_utf8_to_utf16(s), L'\0'); + utf16string::value_type* const destData = &dest[0]; + size_t destIndex = 0; + + for (size_t index = 0; index < srcSize; ++index) + { + UtilCharInternal_t src = srcData[index]; + switch (src & 0xF0) + { + case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF + { + const UtilCharInternal_t c2 {srcData[++index]}; + const UtilCharInternal_t c3 {srcData[++index]}; + const UtilCharInternal_t c4 {srcData[++index]}; + uint32_t codePoint = + ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS); + if (codePoint >= SURROGATE_PAIR_START) + { + // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs. + // - 0x10000 is subtracted from the code point + // - high surrogate is 0xD800 added to the top ten bits + // - low surrogate is 0xDC00 added to the low ten bits + codePoint -= SURROGATE_PAIR_START; + destData[destIndex++] = static_cast((codePoint >> 10) | H_SURROGATE_START); + destData[destIndex++] = + static_cast((codePoint & 0x3FF) | L_SURROGATE_START); + } + else + { + // In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point + // value. U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present + // but will encode them if encountered. + destData[destIndex++] = static_cast(codePoint); + } + } + break; + case 0xE0: // 3 byte character, 0x800 to 0xFFFF + { + const UtilCharInternal_t c2 {srcData[++index]}; + const UtilCharInternal_t c3 {srcData[++index]}; + destData[destIndex++] = static_cast( + ((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS)); + } + break; + case 0xD0: // 2 byte character, 0x80 to 0x7FF + case 0xC0: + { + const UtilCharInternal_t c2 {srcData[++index]}; + destData[destIndex++] = + static_cast(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS)); + } + break; + default: // single byte character, 0x0 to 0x7F + // try to use a fast inner loop for following single byte characters, + // since they are quite probable + do + { + destData[destIndex++] = static_cast(srcData[index++]); + } while (index < srcSize && srcData[index] > 0); + // adjust index since it will be incremented by the for loop + --index; + } + } + return dest; +} + +inline size_t count_utf16_to_utf8(const utf16string& w) +{ + const utf16string::value_type* const srcData = &w[0]; + const size_t srcSize = w.size(); + size_t destSize(srcSize); + for (size_t index = 0; index < srcSize; ++index) + { + const utf16string::value_type ch(srcData[index]); + if (ch <= 0x7FF) + { + if (ch > 0x7F) // 2 bytes needed (11 bits used) + { + ++destSize; + } + } + // Check for high surrogate. 
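// For example, U+1F600 is stored as the surrogate pair 0xD83D 0xDE00; the low surrogate is
// consumed here and destSize grows by 2, giving the 4 UTF-8 bytes that code point needs.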
+ else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // 4 bytes needed (21 bits used) + { + ++index; + if (index == srcSize) + { + throw std::range_error("UTF-16 string is missing low surrogate"); + } + + const auto lowSurrogate = srcData[index]; + if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END) + { + throw std::range_error("UTF-16 string has invalid low surrogate"); + } + + destSize += 2; + } + else // 3 bytes needed (16 bits used) + { + destSize += 2; + } + } + + return destSize; +} + +std::string /*__cdecl conversions::*/utf16_to_utf8(const utf16string& w) +{ + const size_t srcSize = w.size(); + const utf16string::value_type* const srcData = &w[0]; + std::string dest(count_utf16_to_utf8(w), '\0'); + std::string::value_type* const destData = &dest[0]; + size_t destIndex(0); + + for (size_t index = 0; index < srcSize; ++index) + { + const utf16string::value_type src = srcData[index]; + if (src <= 0x7FF) + { + if (src <= 0x7F) // single byte character + { + destData[destIndex++] = static_cast(src); + } + else // 2 bytes needed (11 bits used) + { + destData[destIndex++] = static_cast(char((src >> 6) | 0xC0)); // leading 5 bits + destData[destIndex++] = static_cast(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits + } + } + // Check for high surrogate. + else if (src >= H_SURROGATE_START && src <= H_SURROGATE_END) + { + const auto highSurrogate = src; + const auto lowSurrogate = srcData[++index]; + + // To get from surrogate pair to Unicode code point: + // - subtract 0xD800 from high surrogate, this forms top ten bits + // - subtract 0xDC00 from low surrogate, this forms low ten bits + // - add 0x10000 + // Leaves a code point in U+10000 to U+10FFFF range. + uint32_t codePoint = highSurrogate - H_SURROGATE_START; + codePoint <<= 10; + codePoint |= lowSurrogate - L_SURROGATE_START; + codePoint += SURROGATE_PAIR_START; + + // 4 bytes needed (21 bits used) + destData[destIndex++] = static_cast((codePoint >> 18) | 0xF0); // leading 3 bits + destData[destIndex++] = static_cast(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits + destData[destIndex++] = static_cast(((codePoint >> 6) & LOW_6BITS) | BIT8); // next 6 bits + destData[destIndex++] = static_cast((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits + } + else // 3 bytes needed (16 bits used) + { + destData[destIndex++] = static_cast((src >> 12) | 0xE0); // leading 4 bits + destData[destIndex++] = static_cast(((src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits + destData[destIndex++] = static_cast((src & LOW_6BITS) | BIT8); // trailing 6 bits + } + } + + return dest; +} From d124ca9f5b14458fdd110a96aee06e42178f4b3e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 3 Aug 2021 23:00:06 -0700 Subject: [PATCH 107/254] allow float32 conversion in QS interface --- src/microsoft/quicksand.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 099ce1808..a439197b7 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -302,7 +302,8 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP Type targetPrecType = (Type) targetPrec; if (targetPrecType == Type::packed16 || targetPrecType == Type::packed8avx2 - || targetPrecType == Type::packed8avx512) { + || targetPrecType == Type::packed8avx512 + || (targetPrecType == Type::float32 && addLsh)) { // only allow non-conversion to float32 if we also use the LSH graph->packAndSave(outputFile, configStr.str(), targetPrecType); 
std::cerr << "Conversion Finished." << std::endl; } else { From e025bfb07c471c7505d0cc38e718f0f708783539 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 6 Aug 2021 08:02:18 +0000 Subject: [PATCH 108/254] Merged PR 20070: Run regression tests in Azure Pipelines The changes proposed in this pull request: * Added regression testing with internal models into Azure Pipelines on both Windows and Ubuntu * Created https://machinetranslation.visualstudio.com/Marian/_git/marian-prod-tests (more tests will be added over time) * Made regression test outputs (all `.log`, `.out`, `.diff` files) available for inspection as a downloadable artifact. * Made `--build-info` option available in CMake-based Windows builds Warning: I tried to handle multiple cases, but some regression tests may occasionally fail, especially tests using avx2 or avx512 models, because the outputs are system/CPU dependent. I think it's better to merge this already, monitoring the stability of tests, and adding expected outputs variations if necessary, improving the coverage and stability of regression tests over time. --- CMakeLists.txt | 3 + azure-pipelines.yml | 288 ++++++++++++++++++++++++++++++++--- src/common/config_parser.cpp | 8 +- 3 files changed, 276 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 870fb70be..4e6f24c74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -624,6 +624,9 @@ execute_process(COMMAND rm ${CMAKE_CURRENT_SOURCE_DIR}/src/common/build_info.cpp OUTPUT_QUIET ERROR_QUIET) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/common/build_info.cpp.in ${CMAKE_CURRENT_BINARY_DIR}/src/common/build_info.cpp @ONLY) +# to be able to check if this is a CMake-based compilation, which always adds +# build-info option, even on Windows. 
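# (src/common/config_parser.cpp below switches on BUILD_INFO_AVAILABLE rather than _MSC_VER, so --build-info also works in CMake-driven MSVC builds)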
+add_definitions(-DBUILD_INFO_AVAILABLE=1) # Compile source files include_directories(${marian_SOURCE_DIR}/src) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a11988189..4f7ce02da 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -15,25 +15,37 @@ pool: name: Azure Pipelines variables: - BOOST_ROOT_WINDOWS: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64" - CUDA_PATH_WINDOWS: "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA" - MKL_DIR: "$(Build.SourcesDirectory)/mkl" - MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" - VCPKG_COMMIT: c69096659f49e2b1aca532ea5c2f8c135182519b - VCPKG_DIR: "$(Build.SourcesDirectory)/vcpkg" - VCPKG_PACKAGES: "protobuf" + - group: marian-prod-tests + - name: BOOST_ROOT_WINDOWS + value: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64" + - name: BOOST_URL + value: "https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe" + - name: CUDA_PATH_WINDOWS + value: "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA" + - name: MKL_DIR + value: "$(Build.SourcesDirectory)/mkl" + - name: MKL_URL + value: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" + - name: VCPKG_COMMIT + value: c69096659f49e2b1aca532ea5c2f8c135182519b + - name: VCPKG_DIR + value: "$(Build.SourcesDirectory)/vcpkg" + - name: VCPKG_PACKAGES + value: "protobuf" # The Visual Studio installation directory can be found using: # pushd "C:\Program Files (x86)\Microsoft Visual Studio\Installer\" # for /f "delims=" %%x in ('.\vswhere.exe -latest -property InstallationPath') do set VSPATH=%%x # popd - VS_PATH: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise" + - name: VS_PATH + value: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise" stages: -- stage: Build +- stage: Builds jobs: ###################################################################### - - job: Windows + - job: BuildWindows + displayName: Windows strategy: matrix: @@ -62,21 +74,31 @@ stages: Expand-Archive -Force mkl.zip $(MKL_DIR) displayName: Download MKL - ## Cache for vcpkg packages. It does not work yet properly due to linker errors after restoring it. + ## Cache for Boost #- task: Cache@2 - # displayName: Cache + # displayName: Cache Boost # inputs: # # Change the first value (v0) to another value to clear the cache - # key: 'v0 | "$(VCPKG_PACKAGES)" | vcpkg | "$(Agent.OS)"' - # path: $(VCPKG_DIR) + # key: '"v0" | "boost" | "$(BOOST_URL)" | "$(BOOST_ROOT_WINDOWS)" | "$(Agent.OS)"' + # path: $(BOOST_ROOT_WINDOWS) + # cacheHitVar: CACHE_BOOST_RESTORED # Boost is no longer pre-installed on Azure/GitHub-hosted Windows runners - pwsh: | Write-Host "Downloading Boost to $(BOOST_ROOT_WINDOWS)" - $Url = "https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe" - C:\msys64\usr\bin\wget.exe -nv $Url -O "$(Pipeline.Workspace)/boost.exe" + C:\msys64\usr\bin\wget.exe -nv "$(BOOST_URL)" -O "$(Pipeline.Workspace)/boost.exe" Start-Process -Wait -FilePath "$(Pipeline.Workspace)/boost.exe" "/SILENT","/SP-","/SUPPRESSMSGBOXES","/DIR=$(BOOST_ROOT_WINDOWS)" displayName: Download Boost + condition: ne(variables.CACHE_BOOST_RESTORED, 'true') + + ## Cache for vcpkg packages. It does not work yet properly due to linker errors after restoring it. 
+ #- task: Cache@2 + # displayName: Cache vcpkg + # inputs: + # # Change the first value (v0) to another value to clear the cache + # key: '"v0" | "vcpkg" | "$(VCPKG_COMMIT)" | "$(VCPKG_PACKAGES)" | "$(Agent.OS)"' + # path: $(VCPKG_DIR) + # cacheHitVar: CACHE_VCPKG_RESTORED - pwsh: | git clone https://github.com/Microsoft/vcpkg.git $(VCPKG_DIR) @@ -91,6 +113,7 @@ stages: Remove-Item $(VCPKG_DIR)\downloads -Force -Recurse -ErrorAction SilentlyContinue Remove-Item $(VCPKG_DIR)\buildtrees -Force -Recurse -ErrorAction SilentlyContinue displayName: Prepare vcpkg + condition: ne(variables.CACHE_VCPKG_RESTORED, 'true') - script: | :: Load VS environment @@ -402,12 +425,12 @@ stages: - checkout: self submodules: true - # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev - # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev + # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev + # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler displayName: Install packages - # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html + # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - bash: | wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" @@ -444,3 +467,230 @@ stages: ls -lah * displayName: Check targets workingDirectory: install + + +# Marian is built in the same job where the regression tests are run to make sure that executables +# is compiled and run on a machine with the same CPU architecture, which is required for +# compilations with FBGEMM. 
+- stage: Tests + jobs: + + ###################################################################### + - job: TestWindows + displayName: Windows CPU+FBGEMM + + pool: + vmImage: windows-latest + + steps: + # Due to multiple checkouts this will be commonly cloned into D:\a\1\s\marian-dev + - checkout: self + submodules: true + + - pwsh: | + C:\msys64\usr\bin\wget.exe -nv $(MKL_URL) -O mkl.zip + Expand-Archive -Force mkl.zip $(MKL_DIR) + displayName: Download MKL + + # Cache for vcpkg packages + - task: Cache@2 + displayName: Cache vcpkg + inputs: + # Change the first value (v0) to another value to clear the cache + key: '"v0" | "vcpkg" | "$(VCPKG_COMMIT)" | "$(VCPKG_PACKAGES)" | "$(Agent.OS)"' + path: $(VCPKG_DIR) + cacheHitVar: CACHE_VCPKG_RESTORED + + - pwsh: | + git clone https://github.com/Microsoft/vcpkg.git $(VCPKG_DIR) + cd $(VCPKG_DIR) + git checkout $(VCPKG_COMMIT) + pushd + .\bootstrap-vcpkg.bat -disableMetrics + popd + # Install packages + .\vcpkg.exe install --triplet x64-windows-static $(VCPKG_PACKAGES) + # Clean to make the cache smaller + Remove-Item $(VCPKG_DIR)\downloads -Force -Recurse -ErrorAction SilentlyContinue + Remove-Item $(VCPKG_DIR)\buildtrees -Force -Recurse -ErrorAction SilentlyContinue + displayName: Prepare vcpkg + condition: ne(variables.CACHE_VCPKG_RESTORED, 'true') + + - script: | + :: Load VS environment + call "$(VS_PATH)/VC/Auxiliary/Build/vcvarsall.bat" x64 + :: Create build directory + mkdir build + cd build + :: Run CMake + cmake .. -G Ninja ^ + -DCMAKE_BUILD_TYPE="Slim" ^ + -DCMAKE_C_COMPILER="cl.exe" ^ + -DCMAKE_CXX_COMPILER="cl.exe" ^ + -DCMAKE_MAKE_PROGRAM="ninja.exe" ^ + -DCMAKE_TOOLCHAIN_FILE="$(VCPKG_DIR)\scripts\buildsystems\vcpkg.cmake" ^ + -DVCPKG_TARGET_TRIPLET="x64-windows-static" ^ + ^ + -DCOMPILE_CPU="TRUE" ^ + -DCOMPILE_CUDA="FALSE" ^ + ^ + -DUSE_FBGEMM="TRUE" ^ + -DUSE_SENTENCEPIECE="TRUE" ^ + -DUSE_STATIC_LIBS="TRUE" + displayName: Configure CMake + env: + # Set envvars so that CMake can find the installed packages + MKLROOT: $(MKL_DIR) + workingDirectory: marian-dev + + - script: | + pwd + call "$(VS_PATH)/VC/Auxiliary/Build/vcvarsall.bat" x64 + ninja + displayName: Compile + workingDirectory: marian-dev/build + + ## Publish an artifact with Marian executables. Disabled because it is not portable due to + ## fbgemm and not needed at the moment + #- pwsh: Compress-Archive -Path marian*.exe spm_*.exe -DestinationPath $(Build.SourcesDirectory)/marian-dev-ci_windows-x64_cpu.zip + #displayName: Prepare binaries + #workingDirectory: marian-dev/build + #- publish: marian-dev-ci_windows-x64_cpu.zip + #artifact: marian-dev-ci_windows-x64_cpu + #displayName: Publish binaries + + # Running regression tests + # Due to multiple checkouts this will be commonly cloned into D:\a\1\s\marian-prod-tests + - checkout: git://Marian/marian-prod-tests + + # Collect details about the CPU architecture, etc. + # Because the outputs goes to *.log files, they will be also included in the artifact with test outputs. 
+ - script: bash -c "cat /proc/cpuinfo | tee cpuinfo.log" + displayName: Machine statistics + workingDirectory: marian-prod-tests + + - bash: | + cd models + bash download-models.sh + ls + displayName: Prepare tests + env: + AWS_SECRET_SAS_TOKEN: $(blob-sas-token) + workingDirectory: marian-prod-tests + + # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' + # instead of '/', which often breaks the job + - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + continueOnError: true + displayName: Run tests + workingDirectory: marian-prod-tests + + - bash: | + # cut -c3- removes './' from paths making 7z to retain the directory structure + find . -type f \( -name "*.log" -o -name "*.out" -o -name "*.diff" \) -print | cut -c3- > listing.txt + echo "Creating an artifact with the following files:" + cat listing.txt + 7z a -tzip ../regression-tests-ci_windows-x64_cpu.zip @listing.txt + displayName: Collect outputs + workingDirectory: marian-prod-tests + + - publish: regression-tests-ci_windows-x64_cpu.zip + artifact: regression-tests-ci_windows-x64_cpu + displayName: Publish outputs + + ###################################################################### + - job: TestLinux + displayName: Linux CPU+FBGEMM + + pool: + vmImage: ubuntu-latest + + steps: + # Due to multiple checkouts this will be commonly cloned into D:\a\1\s\marian-dev + - checkout: self + submodules: true + + # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev + # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev + - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-8 g++-8 + displayName: Install packages + + # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html + - bash: | + wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - + sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" + sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" + sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 + displayName: Install MKL + + # Note that COMPILE_CPU=on and USE_SENTENCEPUECE=on are set explicitly to make them detectable + # by the regression tests framework (not sure if it is still required) + - bash: | + mkdir -p install + mkdir -p build + cd build + CC=/usr/bin/gcc-8 CXX=/usr/bin/g++-8 \ + cmake .. \ + -DCMAKE_BUILD_TYPE=slim \ + -DCOMPILE_CPU=on \ + -DCOMPILE_CUDA=off \ + -DUSE_FBGEMM=on \ + -DUSE_SENTENCEPIECE=on \ + -DUSE_STATIC_LIBS=on + displayName: Configure CMake + workingDirectory: marian-dev + + - bash: make -j3 + displayName: Compile + workingDirectory: marian-dev/build + + ## Publish an artifact with Marian executables. 
Disabled because it is not portable due to + ## fbgemm and not needed at the moment + #- bash: zip $(Build.SourcesDirectory)/marian-dev-ci_linux-x64-static_cpu.zip marian* spm_* + #displayName: Prepare binaries + #workingDirectory: marian-dev/build + #- publish: marian-dev-ci_linux-x64-static_cpu.zip + #artifact: marian-dev-ci_linux-x64-static_cpu + #displayName: Publish binaries + + # Running regression tests + # Due to multiple checkouts this will be commonly cloned into D:\a\1\s\marian-prod-tests + - checkout: git://Marian/marian-prod-tests + + # Collect details about the CPU architecture, etc. + # Because the outputs goes to *.log files, they will be also included in the artifact with test outputs. + - bash: | + echo ">>> lscpu" + lscpu | tee lscpu.log + echo ">>> cpuinfo" + cat /proc/cpuinfo | tee cpuinfo.log + /usr/bin/gcc-8 --version | tee gcc.log + displayName: Machine statistics + workingDirectory: marian-prod-tests + + - bash: | + cd models + bash download-models.sh + ls + displayName: Prepare tests + env: + AWS_SECRET_SAS_TOKEN: $(blob-sas-token) + workingDirectory: marian-prod-tests + + - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + continueOnError: true + displayName: Run tests + workingDirectory: marian-prod-tests + + - bash: | + # cut -c3- removes './' from paths making 7z to retain the directory structure + find . -type f \( -name "*.log" -o -name "*.out" -o -name "*.diff" \) -print | cut -c3- > listing.txt + echo "Creating an artifact with the following files:" + cat listing.txt + 7z a -tzip ../regression-tests-ci_linux-x64-static_cpu.zip @listing.txt + displayName: Collect outputs + workingDirectory: marian-prod-tests + + - publish: regression-tests-ci_linux-x64-static_cpu.zip + artifact: regression-tests-ci_linux-x64-static_cpu + displayName: Publish outputs diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 870bf52d5..d7818afb4 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -983,15 +983,15 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) auto buildInfo = get("build-info"); if(!buildInfo.empty() && buildInfo != "false") { -#ifndef _MSC_VER // cmake build options are not available on MSVC based build. +#ifdef BUILD_INFO_AVAILABLE // cmake build options are not available on MSVC based build. 
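// BUILD_INFO_AVAILABLE is defined by the top-level CMakeLists.txt, so this branch is taken for any CMake-driven build, including CMake-based MSVC builds.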
if(buildInfo == "all") std::cerr << cmakeBuildOptionsAdvanced() << std::endl; else std::cerr << cmakeBuildOptions() << std::endl; exit(0); -#else // _MSC_VER - ABORT("build-info is not available on MSVC based build."); -#endif // _MSC_VER +#else // BUILD_INFO_AVAILABLE + ABORT("build-info is not available on MSVC based build unless compiled via CMake."); +#endif // BUILD_INFO_AVAILABLE } // get paths to extra config files From 6652b310b10fb7cff916fda2e1da4836dde701a9 Mon Sep 17 00:00:00 2001 From: Rohit Jain Date: Thu, 2 Sep 2021 05:20:56 +0000 Subject: [PATCH 109/254] Merged PR 20560: Update SPM in Marian Update SPM in Marian --- src/3rd_party/sentencepiece | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 28f9eb890..c307b874d 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 28f9eb890f62907406c629acd2f04ca9b71442c9 +Subproject commit c307b874deb5ea896db8f93506e173353e66d4d3 From 8d0a3c0c2749f234acf60a6c33fa93d5918f8fe7 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 7 Sep 2021 11:11:58 +0100 Subject: [PATCH 110/254] Add --allow-unauthenticated when installing CUDA (#878) --- scripts/ci/install_cuda_ubuntu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/install_cuda_ubuntu.sh b/scripts/ci/install_cuda_ubuntu.sh index 8dc77eda0..b058294ae 100755 --- a/scripts/ci/install_cuda_ubuntu.sh +++ b/scripts/ci/install_cuda_ubuntu.sh @@ -91,7 +91,7 @@ sudo add-apt-repository "deb ${REPO_URL} /" sudo apt-get update echo "Installing CUDA packages ${CUDA_PACKAGES}" -sudo apt-get -y install ${CUDA_PACKAGES} +sudo apt-get -y --allow-unauthenticated install ${CUDA_PACKAGES} if [[ $? -ne 0 ]]; then echo "CUDA Installation Error." From 4dd30b5065efba61fc044e9dc4303205c9d2ac53 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 8 Sep 2021 14:02:21 +0100 Subject: [PATCH 111/254] Factor concatenation improvements and documentation (#748) * concatenation combining option added when embeding using factors * crossMask not used by default * added an option to better clarify when choosing factor predictor options * fixed bug when choosing re-embedding option and not setting embedding size * avoid uncessary string copy * Check in factors documentation * Fix duplication in merge * Self-referential repository * change --factors-predictor to --lemma-dependency. Default behaviour changed. * factor related options are now stored with the model * Update doc/factors.md * add backward compability for the target factors * Move backward compatibility checks for factors to happen after the model.npz config is loaded * Add explicit error msg if using concat on target * Update func comments. 
Fix spaces * Add Marian version requirement * delete experimental code Co-authored-by: Pedro Coelho Co-authored-by: Pedro Coelho Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 1 + doc/factors.md | 218 ++++++++++++++++++++++++++++++++ src/common/config.cpp | 15 +++ src/common/config_parser.cpp | 7 + src/data/corpus_base.cpp | 49 ------- src/data/corpus_base.h | 3 - src/data/factored_vocab.cpp | 38 ++++++ src/data/factored_vocab.h | 14 +- src/layers/embedding.cpp | 74 ++++++++--- src/layers/embedding.h | 2 + src/layers/output.cpp | 44 +++---- src/models/encoder_classifier.h | 3 + src/models/encoder_decoder.cpp | 3 + src/models/encoder_pooler.h | 3 + src/models/s2s.h | 4 +- src/models/transformer.h | 10 +- 16 files changed, 377 insertions(+), 111 deletions(-) create mode 100644 doc/factors.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 05658fe10..e0b853144 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Expresion graph documentation (#788) - Graph operators documentation (#801) - Remove unused variable from expression graph +- Factor groups and concatenation: doc/factors.md ## [1.10.0] - 2021-02-06 diff --git a/doc/factors.md b/doc/factors.md new file mode 100644 index 000000000..59e14b682 --- /dev/null +++ b/doc/factors.md @@ -0,0 +1,218 @@ +# Using marian with factors + +Following this README should allow the user to train a model with source and/or target side factors. To train with factors, the data must be formatted in a certain way. A special vocabulary file format is also required, and its extension should be `.fsv`, as providing a source and/or target vocabulary file with this extension is what triggers the use of source and/or target factors. See details below. + +### Requirements: + +In order to use factors in Marian, you should use at least Marian 1.9.0, unless you want to use factor functionality that requires setting one of the following command line options to their non-default values: `--factors-combine`, `--factors-dim-emb` and `--lemma-dependency`, as these options were only introduced in Marian 1.10.20+. + +## Define factors + +Factors should be organized in "groups," where each group represents a different feature. For example, there could be a group denoting capitalization and another denoting subword divisions. + +Factors within a single group should start with the same string. + +For example, for a capitalization factor group, the individual factors could be: + +* `c0`: all lowercase + +* `c1`: first character capitalized, rest lowercase + +* `c2`: all uppercase + +If there were a second factor group for subword divisions, the individual factors could be: + +* `s0`: end of word, whitespace should follow + +* `s1`: join token with next subword + +There is no fixed limit on the number of factor groups beyond some practical limitations in how the vocabulary is stored by `marian`. If that limit is exceeded, `marian` will throw an error. + +Factor group zero is always the actual words in the text, referred to as *lemmas*. + +## Data preparation + +Factors are appended to the *lemmas* with a pipe `|`. The pipe also separates factors of multiple groups. + +Example sentence: + +``` +Trump tested positive for COVID-19. +``` + +Preprocessed sentence: +``` +trump test@@ ed positive for c@@ o@@ v@@ i@@ d - 19 . 
+``` + +Apply factors: +``` +trump|c1|s0 test|c0|s1 ed|c0|s0 positive|c0|s0 for|c0|s0 c|c2|s1 o|c2|s1 v|c2|s1 i|c2|s1 d|c2|s0 -|c0|s0 19|c0|s0 .|c0|s0 +``` + + +## Create the factored vocabulary + +Factored vocabularies must have the extension `.fsv`. How to structure the vocabulary file is described below. If using factors only on the source or target side, the vocabulary of the other side can be a normal `json`, `yaml`, etc. + +The `.fsv` vocabulary must have two sections: + +1. **Factors** + + The factor groups are defined with an underscore prepended. The colon indicates which factor group each factor inherits from. `_has_c` is used in the definition of the words in the vocabulary (see #2 below) to indicate that that word has that factor group. The `_lemma` factor is used for the words/tokens themselves; this must be present. + + ``` + _lemma + + _c + c0 : _c + c1 : _c + c2 : _c + _has_c + + _s + s0 : _s + s1 : _s + _has_s + ``` + +2. **Lemmas** + + These are the vocabulary entries themselves. They have the format of `LEMMA : _lemma [_has_c] [_has_s]`. The `_has_X` should only apply to lemmas that can have an `X` factor anywhere in the data (which will likely be all of the tokens except `` and ``). + + Examples: + ``` + : _lemma + : _lemma + , : _lemma _has_c _has_s + . : _lemma _has_c _has_s + the : _lemma _has_c _has_s + for : _lemma _has_c _has_s + ``` + + +#### Other Requirements + +Certain characters are used by the `.fsv` vocabulary that will have to be escaped/replaced in the data: `#:_\|` + +The tokens in the factor vocabularies (`c0`, `c1`, `s0`, etc.) cannot be present in any of the *lemmas*. + +### Full `.fsv` file + +Putting everything together, the final `.fsv` file should look like this. It can have comments (lines started by `#`). + + ``` + # factors + +_lemma + +_c +c0 : _c +c1 : _c +c2 : _c +_has_c + +_s +s0 : _s +s1 : _s +_has_s + + # lemmas + + : _lemma + : _lemma +, : _lemma _has_c _has_s +. : _lemma _has_c _has_s +the : _lemma _has_c _has_s +for : _lemma _has_c _has_s + ``` + +## Training options + +There are two choices for how factor embeddings are combined with *lemma* embeddings: summation and concatenation. + +``` +--factors-combine TEXT=sum How to combine the factors and lemma embeddings. + Options available: sum, concat +``` + +The dimension of the factor embeddings must be specified if using combine option `concat`. If using `sum`, the factor embedding dimension matches that of the lemmas. + +``` +--factors-dim-emb INT Embedding dimension of the factors. Only used if concat is selected as factors combining form +``` + +Note: At the moment `concat` is only implemented for usage in the source side. + +### Prediction + +If using factors on the target side, there are multiple options for how factor predictions are generated related to the form of conditioning / dependencies of factors and lemmas. If no option is set with `--lemma-dependency`, the default behavior will be predicting the factors with no lemma dependency. + +``` +--lemma-dependency TEXT Lemma dependency method to use when predicting target factors. 
+ Options: soft-transformer-layer, hard-transformer-layer, lemma-dependent-bias, re-embedding + +--lemma-dim-emb INT=0 Re-embedding dimension of lemma in factors +``` + +* `soft-transformer-layer`: Uses an additional transformer layer to predict the factors using the previously predicted lemma +* `hard-transformer-layer`: Like `soft-transformer-layer` but with hard-max +* `lemma-dependent-bias`: Adds a learned bias term based on the predicted lemma to the logits of the factors. There is no additional transformer layer introduced with this option +* `re-embedding`: After predicting a lemma, re-embed the lemma and add this new vector before predicting the factors +* `lemma-dim-emb`: Controls the dimension of the re-embedded lemma when using the option `re-embedding` + + +### Weight tying + +If you use factors both on the source and target side, and the factors are the same for both sides you can tie the embeddings exactly as you do for non factored models. + +If factors are used only on one side (either source or target) with a joint vocabulary, there are two options for tying source and target embedding weights: + +1. Use combine option `concat` (If using factors only on the source side). +2. Use combine option `sum`, and create "dummy" factors on the non-factorized side. This entails creating a factored vocabulary where the same number of factors are present as are on the side with meaningful factors. In the previous example, if we have the capitalization and subword factors on the source side, the target side would have five different dummy factors (they can all be in the same group). In the *lemma* section of the `.fsv` file we would just not put `_has_X` for any lemma. + + ``` + # factors + + _lemma + + _d + d0 : _d + d1 : _d + d2 : _d + d3 : _d + d4 : _d + _has_d + + # lemmas + + : _lemma + : _lemma + , : _lemma + . : _lemma + le : _lemma + pour : _lemma + ``` + +## Examples +Some examples of possible commands to train factored models in marian: +* Using factors on both source and target. Using `sum` to combine lemma and factor embeddings. No tied embeddings and no lemma dependency when predicting the factors: +``` +path_to/build/marian -t corpus.fact.{src,trg} \ + -v vocab.{src,trg}.fsv +``` +* Using factors only on the source side. Using `concat` to combine lemma and factor embeddings. Source, target and output embeddings matrices tied: +``` +path_to/build/marian -t corpus.fact.src corpus.trg \ + -v vocab.src.fsv vocab.trg.yml \ + --factors-combine concat \ + --factors-dim-emb 8 \ + --tied-embeddings-all +``` +* Using factors only on the target side. Using `sum` to combine lemma and factor embeddings. Target and output embedding matrices tied. 
Predicting factors with `soft-transformer-layer` lemma dependency: +``` +path_to/build/marian -t corpus.src corpus.fact.trg \ + -v vocab.src.yml vocab.fsv.trg \ + --tied-embeddings \ + --lemma-dependency soft-transformer-layer +``` diff --git a/src/common/config.cpp b/src/common/config.cpp index a1c4ed5ac..9878c70b0 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -116,6 +116,21 @@ void Config::initialize(ConfigParser const& cp) { config_["tsv-fields"] = tsvFields; } + // ensures factors backward compatibility whilst keeping the more user friendly CLI + if(get("lemma-dependency").empty()) { + YAML::Node config; + int lemmaDimEmb = get("lemma-dim-emb"); + if(lemmaDimEmb > 0) { + config_["lemma-dependency"] = "re-embedding"; + } else if(lemmaDimEmb == -1) { + config_["lemma-dependency"] = "lemma-dependent-bias"; + } else if(lemmaDimEmb == -2) { + config_["lemma-dependency"] = "soft-transformer-layer"; + } else if(lemmaDimEmb == -3) { + config_["lemma-dependency"] = "hard-transformer-layer"; + } + } + // echo full configuration log(); diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d7818afb4..30d77e369 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -195,6 +195,13 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--dim-emb", "Size of embedding vector", 512); + cli.add("--factors-dim-emb", + "Embedding dimension of the factors. Only used if concat is selected as factors combining form"); + cli.add("--factors-combine", + "How to combine the factors and lemma embeddings. Options available: sum, concat", + "sum"); + cli.add("--lemma-dependency", + "Lemma dependency method to use when predicting target factors. Options: soft-transformer-layer, hard-transformer-layer, lemma-dependent-bias, re-embedding"); cli.add("--lemma-dim-emb", "Re-embedding dimension of lemma in factors", 0); diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 5f9a9ee36..9d95a1214 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -566,54 +566,5 @@ void SentenceTuple::setWeights(const std::vector& weights) { weights_ = weights; } -// experimental: hide inline-fix source tokens from cross attention -std::vector SubBatch::crossMaskWithInlineFixSourceSuppressed() const -{ - const auto& srcVocab = *vocab(); - - auto factoredVocab = vocab()->tryAs(); - size_t inlineFixGroupIndex = 0, inlineFixSrc = 0; - auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc); - - auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG]; - auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG]; - auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG]; - auto unkId = srcVocab.getUnkId(); - auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId; - - auto m = mask(); // default return value, which we will modify in-place below in case we need to - if (hasInlineFixFactors || hasInlineFixTags) { - LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens"); - - // example: force French translation of name "frank" to always be "franck" - // - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to - // - hasInlineFixTags: " frank franck ", "frank" and all tags cannot be cross-attended to - auto dimBatch = batchSize(); // number of sentences in the batch - auto dimWidth = batchWidth(); // number of words in the longest sentence in the 
batch - const auto& d = data(); - size_t numWords = 0; - for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries - bool inside = false; - for (size_t s = 0; s < dimWidth; s++) { // loop over source positions - auto i = locate(/*batchIdx=*/b, /*wordPos=*/s); - if (!m[i]) - break; - numWords++; - // keep track of entering/exiting the inline-fix source tags - auto w = d[i]; - if (w == fixSrcId) - inside = true; - else if (w == fixTgtId) - inside = false; - bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc; - if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor) - m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens - } - } - ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??"); - } - return m; -} - } // namespace data } // namespace marian diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 251df5bc6..63a6fb990 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -236,9 +236,6 @@ class SubBatch { } void setWords(size_t words) { words_ = words; } - - // experimental: hide inline-fix source tokens from cross attention - std::vector crossMaskWithInlineFixSourceSuppressed() const; }; /** diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index cc7159938..e05f31225 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -663,6 +663,44 @@ std::string FactoredVocab::surfaceForm(const Words& sentence) const /*override f return res; } +/** + * Auxiliary function that return the total number of factors (no lemmas) in a factored vocabulary. + * @return number of factors + */ +size_t FactoredVocab::getTotalFactorCount() const { + return factorVocabSize() - groupRanges_[0].second; +} + +/** + * Decodes the indexes of lemma and factor for each word and outputs that information separately. + * It will return two data structures that contain separate information regarding lemmas and factors indexes + * by receiving a list with the word indexes of a batch. + * @param[in] words vector of words + * @param[out] lemmaIndices lemma index for each word + * @param[out] factorIndices factor usage information for each word (1 if the factor is used 0 if not) + */ +void FactoredVocab::lemmaAndFactorsIndexes(const Words& words, std::vector& lemmaIndices, std::vector& factorIndices) const { + lemmaIndices.reserve(words.size()); + factorIndices.reserve(words.size() * getTotalFactorCount()); + + auto numGroups = getNumGroups(); + std::vector lemmaAndFactorIndices; + + for (auto &word : words) { + if (vocab_.contains(word.toWordIndex())) { // skip invalid combinations in the space (can only happen during initialization) --@TODO: add a check? 
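// e.g. for "trump|c1|s0" from doc/factors.md, the index of the lemma "trump" goes into lemmaIndices, and each factor group appends a block of 0/1 entries to factorIndices with a single 1 marking c1 and s0 respectively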
+ word2factors(word, lemmaAndFactorIndices); + lemmaIndices.push_back((IndexType) lemmaAndFactorIndices[0]); // save the lemma vocabulary index + for (size_t g = 1; g < numGroups; g++) { // loop over the different factors group + auto factorIndex = lemmaAndFactorIndices[g]; // get the vocabulary index of the factor of group g + ABORT_IF(factorIndex == FACTOR_NOT_SPECIFIED, "Attempted to embed a word with a factor not specified"); + for (int i = 0; i < factorShape_[g] - 1; i++) { // loop over all factors in group g + factorIndices.push_back((float) (factorIndex == i)); // fill the factor indexes array with '0' if the factor is not used in a given word, '1' if it is + } + } + } + } +} + // create a CSR matrix M[V,U] from words[] with M[v,u] = 1 if factor u is a factor of word v // This is used to form the embedding of a multi-factor token. // That embedding is a sum of the embeddings of the individual factors. diff --git a/src/data/factored_vocab.h b/src/data/factored_vocab.h index 6b96d8cd3..b644ce4c4 100644 --- a/src/data/factored_vocab.h +++ b/src/data/factored_vocab.h @@ -49,12 +49,13 @@ class FactoredVocab : public IVocab { virtual size_t lemmaSize() const override; CSRData csr_rows(const Words& words) const; // sparse matrix for summing up factors from the concatenated embedding matrix for each word - + void lemmaAndFactorsIndexes(const Words& words, std::vector& lemmaIndices, std::vector& factorIndices) const; #ifdef FACTOR_FULL_EXPANSION const CSRData& getGlobalFactorMatrix() const { return globalFactorMatrix_; } // [v,u] (sparse) -> =1 if u is factor of v --only used in getLogits() #endif size_t getNumGroups() const { return groupRanges_.size(); } - std::pair getGroupRange(size_t g) const { return groupRanges_[g]; } // [g] -> (u_begin, u_end) + std::pair getGroupRange(size_t g) const { return groupRanges_[g]; } // [g] -> (u_begin, u_end) + size_t getTotalFactorCount() const; #ifdef FACTOR_FULL_EXPANSION const std::vector& getGapLogMask() const { return gapLogMask_; } // [v] -inf if v is a gap entry, else 0 #endif @@ -80,15 +81,6 @@ class FactoredVocab : public IVocab { Word string2word(const std::string& w) const; bool tryGetFactor(const std::string& factorGroupName, size_t& groupIndex, size_t& factorIndex) const; // note: factorGroupName given without separator - // some hard-coded constants from FactoredSegmenter - // The naming mimics the names in FactoredSegmenter.cs, and therefore intentionally does not follow Marian conventions. - // @TODO: We have more hard-coded constants throughout the code. Move them all here. 
- // @TODO: figure out how to do this with static const*/constexpr -#define FactoredVocab_INLINE_FIX_WHAT_serialized "is" -#define FactoredVocab_FIX_SRC_ID_TAG "" -#define FactoredVocab_FIX_TGT_ID_TAG "" -#define FactoredVocab_FIX_END_ID_TAG "" - private: void constructGroupInfoFromFactorVocab(); void constructFactorIndexConversion(); diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 92c4ad6d2..26d6b7fe3 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -8,19 +8,31 @@ Embedding::Embedding(Ptr graph, Ptr options) std::string name = opt("prefix"); int dimVoc = opt("dimVocab"); int dimEmb = opt("dimEmb"); + int dimFactorEmb = opt("dimFactorEmb"); bool fixed = opt("fixed", false); + // Embedding layer initialization should depend only on embedding size, hence fanIn=false + auto initFunc = inits::glorotUniform( + /*fanIn=*/false, /*fanOut=*/true); // -> embedding vectors have roughly unit length + factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get("vocab", "")); if(factoredVocab_) { dimVoc = (int)factoredVocab_->factorVocabSize(); LOG_ONCE(info, "[embedding] Factored embeddings enabled"); + if(opt("factorsCombine") == "concat") { + ABORT_IF(dimFactorEmb == 0, + "Embedding: If concatenation is chosen to combine the factor embeddings, a factor " + "embedding size must be specified."); + int numberOfFactors = (int)factoredVocab_->getTotalFactorCount(); + dimVoc -= numberOfFactors; + FactorEmbMatrix_ + = graph_->param("factor_" + name, {numberOfFactors, dimFactorEmb}, initFunc, fixed); + LOG_ONCE(info, + "[embedding] Combining lemma and factors embeddings with concatenation enabled"); + } } - // Embedding layer initialization should depend only on embedding size, hence fanIn=false - auto initFunc = inits::glorotUniform( - /*fanIn=*/false, /*fanOut=*/true); // -> embedding vectors have roughly unit length - if(options_->has("embFile")) { std::string file = opt("embFile"); if(!file.empty()) { @@ -32,6 +44,26 @@ Embedding::Embedding(Ptr graph, Ptr options) E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); } +/** + * Embeds a sequence of words (given as indices), where they have factor information. The matrices are concatenated + * @param words vector of words + * @returns Expression that is the concatenation of the lemma and factor embeddings + */ +/*private*/ Expr Embedding::embedWithConcat(const Words& data) const { + auto graph = E_->graph(); + std::vector lemmaIndices; + std::vector factorIndices; + factoredVocab_->lemmaAndFactorsIndexes(data, lemmaIndices, factorIndices); + auto lemmaEmbs = rows(E_, lemmaIndices); + int dimFactors = FactorEmbMatrix_->shape()[0]; + auto factEmbs + = dot(graph->constant( + {(int)data.size(), dimFactors}, inits::fromVector(factorIndices), Type::float32), + FactorEmbMatrix_); + + return concatenate({lemmaEmbs, factEmbs}, -1); +} + // helper to embed a sequence of words (given as indices) via factored embeddings Expr Embedding::multiRows(const Words& data, float dropProb) const { auto graph = E_->graph(); @@ -61,7 +93,9 @@ std::tuple Embedding::apply(Ptrgraph(); int dimBatch = (int)subBatch->batchSize(); - int dimEmb = E_->shape()[-1]; + int dimEmb = (factoredVocab_ && opt("factorsCombine") == "concat") + ? 
E_->shape()[-1] + FactorEmbMatrix_->shape()[-1] + : E_->shape()[-1]; int dimWidth = (int)subBatch->batchWidth(); // factored embeddings: @@ -96,14 +130,8 @@ std::tuple Embedding::apply(Ptrdata(), {dimWidth, dimBatch, dimEmb}); -#if 1 + auto batchMask = graph->constant({dimWidth, dimBatch, 1}, inits::fromVector(subBatch->mask())); -#else // @TODO: this is dead code now, get rid of it - // experimental: hide inline-fix source tokens from cross attention - auto batchMask - = graph->constant({dimWidth, dimBatch, 1}, - inits::fromVector(subBatch->crossMaskWithInlineFixSourceSuppressed())); -#endif // give the graph inputs readable names for debugging and ONNX batchMask->set_name("data_" + std::to_string(/*batchIndex_=*/0) + "_mask"); @@ -112,8 +140,12 @@ std::tuple Embedding::apply(Ptrget("dropout", 0.0f)); // [(B*W) x E] - selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] + Expr selectedEmbs; + if(opt("factorsCombine") == "concat") + selectedEmbs = embedWithConcat(words); // [(B*W) x E] + else + selectedEmbs = multiRows(words, options_->get("dropout", 0.0f)); // [(B*W) x E] + selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] // selectedEmbs = dropout(selectedEmbs, options_->get("dropout", 0.0f), { // selectedEmbs->shape()[-3], 1, 1 }); // @TODO: replace with factor dropout return selectedEmbs; @@ -141,13 +173,15 @@ Expr Embedding::applyIndices(const std::vector& embIdx, const Shape& /*private*/ Ptr EncoderDecoderLayerBase::createEmbeddingLayer() const { // clang-format off auto options = New( - "dimVocab", opt>("dim-vocabs")[batchIndex_], - "dimEmb", opt("dim-emb"), - "dropout", dropoutEmbeddings_, - "inference", inference_, - "prefix", (opt("tied-embeddings-src") || opt("tied-embeddings-all")) ? "Wemb" + "dimVocab", opt>("dim-vocabs")[batchIndex_], + "dimEmb", opt("dim-emb"), + "dropout", dropoutEmbeddings_, + "inference", inference_, + "prefix", (opt("tied-embeddings-src") || opt("tied-embeddings-all")) ? "Wemb" : prefix_ + "_Wemb", - "fixed", embeddingFix_, + "fixed", embeddingFix_, + "dimFactorEmb", opt("factors-dim-emb"), // for factored embeddings + "factorsCombine", opt("factors-combine"), // for factored embeddings "vocab", opt>("vocabs")[batchIndex_]); // for factored embeddings // clang-format on if(options_->hasAndNotEmpty("embedding-vectors")) { diff --git a/src/layers/embedding.h b/src/layers/embedding.h index 2fa7b78d0..d34c7ffb9 100644 --- a/src/layers/embedding.h +++ b/src/layers/embedding.h @@ -12,8 +12,10 @@ class FactoredVocab; // EncoderDecoderLayerBase, which knows to pass on all required parameters from options. 
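// When "factorsCombine" is "concat", lemma vectors are read from E_ and factor vectors from FactorEmbMatrix_, so the effective embedding width becomes dimEmb + dimFactorEmb (see Embedding::apply() in embedding.cpp above).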
class Embedding : public LayerBase, public IEmbeddingLayer { Expr E_; + Expr FactorEmbMatrix_; // Factors embedding matrix if combining lemma and factors embeddings with concatenation Ptr factoredVocab_; Expr multiRows(const Words& data, float dropProb) const; + Expr embedWithConcat(const Words& data) const; bool inference_{false}; public: diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 92cccdfb6..4d6e488a4 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -36,12 +36,12 @@ namespace mlp { b_ = graph_->param(name + "_b", {1, numOutputClasses}, inits::zeros()); /*const*/ int lemmaDimEmb = options_->get("lemma-dim-emb", 0); + std::string lemmaDependency = options_->get("lemma-dependency", ""); ABORT_IF(lemmaDimEmb && !factoredVocab_, "--lemma-dim-emb requires a factored vocabulary"); - if(lemmaDimEmb > 0) { // > 0 means to embed the (expected) word with a different embedding matrix -#define HARDMAX_HACK -#ifdef HARDMAX_HACK - lemmaDimEmb = lemmaDimEmb & 0xfffffffe; // hack to select hard-max: use an odd number -#endif + if(lemmaDependency == "re-embedding") { // embed the (expected) word with a different embedding matrix + ABORT_IF( + lemmaDimEmb <= 0, + "In order to predict factors by re-embedding them, a lemma-dim-emb must be specified."); auto range = factoredVocab_->getGroupRange(0); auto lemmaVocabDim = (int)(range.second - range.first); auto initFunc = inits::glorotUniform( @@ -109,8 +109,12 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { std::vector> allLogits(numGroups, nullptr); // (note: null entries for absent factors) Expr input1 = input; // [B... x D] - Expr Plemma = nullptr; // used for lemmaDimEmb=-1 - Expr inputLemma = nullptr; // used for lemmaDimEmb=-2, -3 + Expr Plemma = nullptr; // used for lemmaDependency = lemma-dependent-bias + Expr inputLemma = nullptr; // used for lemmaDependency = hard-transformer-layer and soft-transformer-layer + + std::string factorsCombine = options_->get("factors-combine", ""); + ABORT_IF(factorsCombine == "concat", "Combining lemma and factors embeddings with concatenation on the target side is currently not supported"); + for(size_t g = 0; g < numGroups; g++) { auto range = factoredVocab_->getGroupRange(g); if(g > 0 && range.first == range.second) // empty entry @@ -130,9 +134,8 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { factorB = slice(b_, -1, Slice((int)range.first, (int)range.second)); } /*const*/ int lemmaDimEmb = options_->get("lemma-dim-emb", 0); - if((lemmaDimEmb == -2 || lemmaDimEmb == -3) - && g > 0) { // -2/-3 means a gated transformer-like structure (-3 = hard-max) - LOG_ONCE(info, "[embedding] using lemma conditioning with gate"); + std::string lemmaDependency = options_->get("lemma-dependency", ""); + if((lemmaDependency == "soft-transformer-layer" || lemmaDependency == "hard-transformer-layer") && g > 0) { // this mimics one transformer layer // - attention over two inputs: // - e = current lemma. 
We use the original embedding vector; specifically, expectation @@ -229,7 +232,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { allLogits[g] = New(factorLogits, nullptr); // optionally add a soft embedding of lemma back to create some lemma dependency // @TODO: if this works, move it into lazyConstruct - if(lemmaDimEmb == -2 && g == 0) { // -2 means a gated transformer-like structure + if(lemmaDependency == "soft-transformer-layer" && g == 0) { LOG_ONCE(info, "[embedding] using lemma conditioning with gate, soft-max version"); // get expected lemma embedding vector auto factorLogSoftmax = logsoftmax( @@ -239,7 +242,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D] - } else if(lemmaDimEmb == -3 && g == 0) { // same as -2 except with hard max + } else if(lemmaDependency == "hard-transformer-layer" && g == 0) { LOG_ONCE(info, "[embedding] using lemma conditioning with gate, hard-max version"); // get max-lemma embedding vector auto maxVal = max(factorLogits, @@ -249,29 +252,22 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D] - } else if(lemmaDimEmb == -1 && g == 0) { // -1 means learn a lemma-dependent bias + } else if(lemmaDependency == "lemma-dependent-bias" && g == 0) { ABORT_IF(shortlist_, "Lemma-dependent bias with short list is not yet implemented"); LOG_ONCE(info, "[embedding] using lemma-dependent bias"); auto factorLogSoftmax = logsoftmax(factorLogits); // (we do that again later, CSE will kick in) auto z = /*stopGradient*/ (factorLogSoftmax); Plemma = exp(z); // [B... x U] - } else if(lemmaDimEmb > 0 && g == 0) { // > 0 means learn a re-embedding matrix + } else if(lemmaDependency == "re-embedding" && g == 0) { + ABORT_IF( + lemmaDimEmb <= 0, + "In order to predict factors by re-embedding them, a lemma-dim-emb must be specified."); LOG_ONCE(info, "[embedding] enabled re-embedding of lemma, at dim {}", lemmaDimEmb); // compute softmax. We compute logsoftmax() separately because this way, computation will be // reused later via CSE auto factorLogSoftmax = logsoftmax(factorLogits); auto factorSoftmax = exp(factorLogSoftmax); -#ifdef HARDMAX_HACK - bool hardmax = (lemmaDimEmb & 1) - != 0; // odd value triggers hardmax for now (for quick experimentation) - if(hardmax) { - lemmaDimEmb = lemmaDimEmb & 0xfffffffe; - LOG_ONCE(info, "[embedding] HARDMAX_HACK enabled. 
Actual dim is {}", lemmaDimEmb); - auto maxVal = max(factorSoftmax, -1); - factorSoftmax = eq(factorSoftmax, maxVal); - } -#endif // re-embedding lookup, soft-indexed by softmax Expr e; if(shortlist_) { // short-listed version of re-embedding matrix diff --git a/src/models/encoder_classifier.h b/src/models/encoder_classifier.h index 4cfc54f1d..5c8ddb5a2 100644 --- a/src/models/encoder_classifier.h +++ b/src/models/encoder_classifier.h @@ -139,6 +139,9 @@ class EncoderClassifier : public EncoderClassifierBase { modelFeatures_.insert("ulr-trainable-transformation"); modelFeatures_.insert("ulr-dim-emb"); modelFeatures_.insert("lemma-dim-emb"); + modelFeatures_.insert("lemma-dependency"); + modelFeatures_.insert("factors-combine"); + modelFeatures_.insert("factors-dim-emb"); } virtual Ptr getOptions() override { return options_; } diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 8fc9321af..66ff16cec 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -62,6 +62,9 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("ulr-dim-emb"); modelFeatures_.insert("lemma-dim-emb"); modelFeatures_.insert("output-omit-bias"); + modelFeatures_.insert("lemma-dependency"); + modelFeatures_.insert("factors-combine"); + modelFeatures_.insert("factors-dim-emb"); } std::vector>& EncoderDecoder::getEncoders() { diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index 1baa8560c..8a2123430 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -149,6 +149,9 @@ class EncoderPooler : public EncoderPoolerBase { modelFeatures_.insert("ulr-trainable-transformation"); modelFeatures_.insert("ulr-dim-emb"); modelFeatures_.insert("lemma-dim-emb"); + modelFeatures_.insert("lemma-dependency"); + modelFeatures_.insert("factors-combine"); + modelFeatures_.insert("factors-dim-emb"); } virtual Ptr getOptions() override { return options_; } diff --git a/src/models/s2s.h b/src/models/s2s.h index 7009fad54..104f946c9 100644 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -318,7 +318,9 @@ class DecoderS2S : public DecoderBase { } last("vocab", opt>("vocabs")[batchIndex_]); // for factored outputs last("lemma-dim-emb", opt("lemma-dim-emb", 0)); // for factored outputs - + last("lemma-dependency", opt("lemma-dependency", "")); // for factored outputs + last("factors-combine", opt("factors-combine", "")); // for factored outputs + last("output-omit-bias", opt("output-omit-bias", false)); // assemble layers into MLP and apply to embeddings, decoder context and diff --git a/src/models/transformer.h b/src/models/transformer.h index a792de8ba..7ec40dc58 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -295,7 +295,8 @@ class Transformer : public EncoderOrDecoderBase { kh = cache_[prefix + "_keys"]; // then return cached tensor } else { - auto Wk = graph_->param(prefix + "_Wk", {dimModel, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); + int dimKeys = keys->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation + auto Wk = graph_->param(prefix + "_Wk", {dimKeys, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 
1.f / sqrtf((float)depth_) : 1.f)); auto bk = graph_->param(prefix + "_bk", {1, dimModel}, inits::zeros()); kh = affine(keys, Wk, bk); // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] @@ -309,7 +310,8 @@ class Transformer : public EncoderOrDecoderBase { && cache_[prefix + "_values"]->shape().elements() == values->shape().elements()) { vh = cache_[prefix + "_values"]; } else { - auto Wv = graph_->param(prefix + "_Wv", {dimModel, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); + int dimValues = values->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation + auto Wv = graph_->param(prefix + "_Wv", {dimValues, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); auto bv = graph_->param(prefix + "_bv", {1, dimModel}, inits::zeros()); vh = affine(values, Wv, bv); // [-4: batch size, -3: num heads, -2: max length, -1: split vector dim] @@ -661,7 +663,9 @@ class DecoderTransformer : public Transformer { "vocab", opt>("vocabs")[batchIndex_], // for factored outputs "output-omit-bias", opt("output-omit-bias", false), "output-approx-knn", opt>("output-approx-knn", {}), - "lemma-dim-emb", opt("lemma-dim-emb", 0)); // for factored outputs + "lemma-dim-emb", opt("lemma-dim-emb", 0), // for factored outputs + "lemma-dependency", opt("lemma-dependency", ""), // for factored outputs + "factors-combine", opt("factors-combine", "")); // for factored outputs if(opt("tied-embeddings") || opt("tied-embeddings-all")) outputFactory.tieTransposed(opt("tied-embeddings-all") || opt("tied-embeddings-src") ? "Wemb" : prefix_ + "_Wemb"); From 8470c16bdd92797e75e7d5e7397ef7b29896538a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 16 Sep 2021 02:00:00 +0000 Subject: [PATCH 112/254] Merged PR 20230: Add option for running regression tests only in Azure Pipelines This PR adds a checkbox which can be unchecked to skip running compilation checks when triggering them manually. It is useful for generating expected outputs on different CPUs for tests using 8-bit models. --- azure-pipelines.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4f7ce02da..d4d0b2e57 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,6 +6,13 @@ # 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file # 4. "More actions" > "Save" +parameters: +# Allow skipping the entire 'Build' stage +- name: runBuilds + displayName: Run builds? Uncheck to run regression tests only. 
+ type: boolean + default: true + # The pipeline CI trigger is set on the branch master only and PR trigger on a # (non-draft) pull request to any branch trigger: @@ -45,6 +52,7 @@ stages: ###################################################################### - job: BuildWindows + condition: eq(${{ parameters.runBuilds }}, true) displayName: Windows strategy: @@ -180,6 +188,7 @@ stages: ###################################################################### - job: BuildUbuntu + condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu timeoutInMinutes: 90 @@ -322,6 +331,7 @@ stages: ###################################################################### - job: BuildUbuntuMinimal + condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5 pool: @@ -368,6 +378,7 @@ stages: ###################################################################### - job: BuildMacOS + condition: eq(${{ parameters.runBuilds }}, true) displayName: macOS CPU clang pool: @@ -416,6 +427,7 @@ stages: ###################################################################### - job: BuildInstall + condition: eq(${{ parameters.runBuilds }}, true) displayName: Linux CPU library install pool: From aa58ba8e239d228d539734e6be8266fbb3181044 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 20 Sep 2021 13:14:24 +0000 Subject: [PATCH 113/254] Merged PR 20593: Fix and update Azure pipelines - Add `--allow-unauthenticated` to `apt` when installing CUDA on Ubuntu - Removing `ubuntu-16.04` image from Azure pipelines, which will become unavailable after September 20 --- azure-pipelines.yml | 26 +++++++------------------- scripts/ci/install_cuda_ubuntu.sh | 2 +- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d4d0b2e57..7953b282b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -246,17 +246,7 @@ stages: examples: true static: true ################################################################ - # Ubuntu 16.04 supports CUDA 8+ - "16.04 CUDA 9.2 gcc-7": - image: ubuntu-16.04 - boost: true - cpu: true - gpu: true - cuda: 9.2 - gcc: 7 - unit_tests: true - examples: true - static: false + # Ubuntu 16.04 is no longer available on Azure-hosted machines pool: vmImage: $(image) @@ -332,18 +322,16 @@ stages: ###################################################################### - job: BuildUbuntuMinimal condition: eq(${{ parameters.runBuilds }}, true) - displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5 + displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5 pool: - vmImage: ubuntu-16.04 + vmImage: ubuntu-18.04 steps: - checkout: self submodules: true # The script simplifies installation of different versions of CUDA. - # Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9. - # Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work... - bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0" displayName: Install CUDA @@ -356,10 +344,10 @@ stages: # GCC 5 is the minimum version supported - bash: | - /usr/bin/gcc-5 --version + /usr/bin/gcc-7 --version mkdir -p build cd build - CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \ + CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \ ../cmake-3.5.1-Linux-x86_64/bin/cmake .. 
\ -DCOMPILE_CPU=on \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 @@ -592,7 +580,7 @@ stages: # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' # instead of '/', which often breaks the job - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests @@ -689,7 +677,7 @@ stages: AWS_SECRET_SAS_TOKEN: $(blob-sas-token) workingDirectory: marian-prod-tests - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests diff --git a/scripts/ci/install_cuda_ubuntu.sh b/scripts/ci/install_cuda_ubuntu.sh index 8dc77eda0..b058294ae 100755 --- a/scripts/ci/install_cuda_ubuntu.sh +++ b/scripts/ci/install_cuda_ubuntu.sh @@ -91,7 +91,7 @@ sudo add-apt-repository "deb ${REPO_URL} /" sudo apt-get update echo "Installing CUDA packages ${CUDA_PACKAGES}" -sudo apt-get -y install ${CUDA_PACKAGES} +sudo apt-get -y --allow-unauthenticated install ${CUDA_PACKAGES} if [[ $? -ne 0 ]]; then echo "CUDA Installation Error." From d796a3c3b7779993660e672f2a47f5cdd685a174 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 28 Sep 2021 17:17:12 +0000 Subject: [PATCH 114/254] Merged PR 20839: Do not ignore ignoreEOS for spm decoding With final space this eliminates trailing whitespace caused by appending EOS --- src/data/sentencepiece_vocab.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 090d478b2..8f774c2bb 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -236,18 +236,20 @@ class SentencePieceVocab : public IVocab { return words; } - std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override { + std::string decode(const Words& sentence, bool ignoreEOS) const override { std::string line; if(keepEncoded_) { // i.e. 
keep the sentence segmented into subword units for(const Word& id : sentence) - line += (*this)[id] + " "; + if(!ignoreEOS || id != getEosId()) + line += (*this)[id] + " "; line.pop_back(); // trim the trailing whitespace } else { // convert vector of Word to vector of int std::vector spmSentence; spmSentence.reserve(sentence.size()); for(auto&& word : sentence) - spmSentence.push_back(word.toWordIndex()); + if(!ignoreEOS || word != getEosId()) + spmSentence.push_back(word.toWordIndex()); spm_->Decode(spmSentence, &line); } return line; From 03fe1758763c99dd55bcf6c1c5e0e1dd60ae4e1a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 28 Sep 2021 17:19:07 +0000 Subject: [PATCH 115/254] Merged PR 20879: Adjustable ffn width and depth in transformer decoder --- src/common/config_parser.cpp | 8 +++++++- src/models/encoder_decoder.cpp | 2 ++ src/models/transformer.h | 21 +++++++++++++++------ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d7818afb4..b3e8950b9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -255,10 +255,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", "Size of position-wise feed-forward network (transformer)", - 2048); + 2048); + cli.add("--transformer-decoder-dim-ffn", + "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.", + 0); cli.add("--transformer-ffn-depth", "Depth of filters (transformer)", 2); + cli.add("--transformer-decoder-ffn-depth", + "Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0", + 0); cli.add("--transformer-ffn-activation", "Activation between filters: swish or relu (transformer)", "swish"); diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 8fc9321af..a7a398e75 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -38,7 +38,9 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-heads"); modelFeatures_.insert("transformer-no-projection"); modelFeatures_.insert("transformer-dim-ffn"); + modelFeatures_.insert("transformer-decoder-dim-ffn"); modelFeatures_.insert("transformer-ffn-depth"); + modelFeatures_.insert("transformer-decoder-ffn-depth"); modelFeatures_.insert("transformer-ffn-activation"); modelFeatures_.insert("transformer-dim-aan"); modelFeatures_.insert("transformer-aan-depth"); diff --git a/src/models/transformer.h b/src/models/transformer.h index a792de8ba..2393ad731 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -400,7 +400,7 @@ class Transformer : public EncoderOrDecoderBase { opt("transformer-heads"), /*cache=*/false); } - Expr LayerFFN(std::string prefix, Expr input) const { + Expr LayerFFN(std::string prefix, Expr input, bool isDecoder=false) const { int dimModel = input->shape()[-1]; float dropProb = inference_ ? 0 : opt("transformer-dropout"); @@ -408,13 +408,22 @@ class Transformer : public EncoderOrDecoderBase { auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb); auto actName = opt("transformer-ffn-activation"); + int dimFfn = opt("transformer-dim-ffn"); int depthFfn = opt("transformer-ffn-depth"); - float ffnDropProb - = inference_ ? 
0 : opt("transformer-dropout-ffn"); - + if(isDecoder) { + int decDimFfn = opt("transformer-decoder-dim-ffn", 0); + if(decDimFfn != 0) + dimFfn = decDimFfn; + + int decDepthFfn = opt("transformer-decoder-ffn-depth", 0); + if(decDepthFfn != 0) + depthFfn = decDepthFfn; + } + ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn); - + + float ffnDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f); // the stack of FF layers @@ -861,7 +870,7 @@ class DecoderTransformer : public Transformer { // remember decoder state decoderStates.push_back(decoderState); - query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query, /*isDecoder=*/true); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] checkpoint(query); } From 12a1bfaf6fc1a4e1011727413b8c04e816a73d43 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 11 Oct 2021 16:59:52 +0100 Subject: [PATCH 116/254] Remove Ubuntu 16.04 from GitHub workflows (#879) * Add --allow-unauthenticated when installing CUDA * Remove workflow with Ubuntu 16.04 --- .github/workflows/ubuntu.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 5353c2144..a7f233ca6 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -50,14 +50,7 @@ jobs: unit_tests: false examples: false # Ubuntu 16.04 supports CUDA 8+ - - name: "Ubuntu 16.04 CUDA 9.2 gcc-7" - os: ubuntu-16.04 - cuda: "9.2" - gcc: 7 - cpu: true - gpu: true - unit_tests: true - examples: true + # But it is no longer available in GitHub workflows runs-on: ${{ matrix.os }} name: ${{ matrix.name }} From 2d79ad02bb66d7e0ba264defbf5ff9b47c70ba74 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 13 Oct 2021 20:20:14 +0000 Subject: [PATCH 117/254] Merged PR 20933: beam & batch works for n on-factored models --- src/layers/output.cpp | 22 ++++++++++++++++------ src/translator/beam_search.cpp | 5 +++-- src/translator/nth_element.cpp | 2 ++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 92cccdfb6..af72b7941 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -313,14 +313,24 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } return Logits(std::move(allLogits), factoredVocab_); } else if(shortlist_) { - return Logits(affineOrDot(input, - shortlist_->getCachedShortWt(), - shortlist_->getCachedShortb(), + const Shape &inputShape = input->shape(); + assert(inputShape[1] == 1); // time dimension always 1 for decoding + input = reshape(input, {inputShape[0], inputShape[2], 1, inputShape[3]}); + + Expr Wt = shortlist_->getCachedShortWt(); + Expr b = shortlist_->getCachedShortb(); + Expr ret = affineShortlist(input, + Wt, + b, false, - /*transB=*/isLegacyUntransposedW ? false : true)); + /*transB=*/isLegacyUntransposedW ? false : true); + const Shape &retShape = ret->shape(); + assert(retShape[2] == 1); // time dimension always 1 for decoding + ret = reshape(ret, {retShape[0], 1, retShape[1], retShape[3]}); + return Logits(ret); } else { - return Logits( - affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true)); + Expr ret = affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? 
false : true); + return Logits(ret); } } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 2a0d3947a..580895f2f 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -94,7 +94,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), @@ -115,7 +115,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } } else if (shortlist) - word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) origBatchIdx, wordIdx)); + word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx)); else word = Word::fromWordIndex(wordIdx); @@ -330,6 +330,7 @@ Histories BeamSearch::search(Ptr graph, Ptr auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step // main loop over output time steps for (size_t t = 0; ; t++) { + //std::cerr << "\nstep=" << t << std::endl; ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??"); // determine beam size for next output time step, as max over still-active sentences // E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then diff --git a/src/translator/nth_element.cpp b/src/translator/nth_element.cpp index 237d9b9da..dbcceec47 100644 --- a/src/translator/nth_element.cpp +++ b/src/translator/nth_element.cpp @@ -3,7 +3,9 @@ * SPDX-License-Identifier: MIT */ +#include "common/utils.h" #include "translator/nth_element.h" + #include #include #include From 7f06f3c5d2035dac0cb4349bf29fbfa3e6bb5448 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 26 Oct 2021 11:20:41 +0000 Subject: [PATCH 118/254] Merged PR 21166: Keep building on macOS-10.15 Marian does not compile on macOS 11.6, so the build has stopped working due to an upgrade from macOS-10.15 to macOS 11.6 in Azure Pipelines: https://github.com/actions/virtual-environments/issues/4060 This PR explicitly set macOS 10.15 in the workflow. --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7953b282b..bc76f85c9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -370,7 +370,7 @@ stages: displayName: macOS CPU clang pool: - vmImage: macos-latest + vmImage: macos-10.15 steps: - checkout: self From 1404201926b5b4e27993776d52dfac809e8556f4 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 26 Oct 2021 20:25:39 +0000 Subject: [PATCH 119/254] Merged PR 21151: Cleaning up fp16 behavior This PR improves clipping and pruning behavior of NaNs and Infs during fp16 training, ultimately avoiding the underflow problems that we were facing so far. 
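Before the code changes, a compact picture of the reworked cost-scaling loop: the training loss is multiplied by a scaling factor, the factor grows after a fixed number of clean updates, shrinks when a non-finite gradient is seen, and never falls below a configured minimum; once at the minimum, NaNs are pruned and Infs clipped in the gradient instead of skipping the update. The class below is an illustrative standalone sketch, not Marian's GraphGroup; its constants mirror the new default "256.f 1000 2.f 256.f".

    // Illustrative sketch of dynamic cost scaling for fp16 training.
    // Not Marian code; constants follow the defaults introduced below.
    #include <cstddef>
    #include <iostream>

    class CostScaler {
      float factor_        = 256.f;  // current scaling factor
      float minimum_       = 256.f;  // never drop below this
      float multiplier_    = 2.f;    // grow/shrink step
      std::size_t freq_    = 1000;   // grow after this many clean updates
      std::size_t clean_   = 0;      // clean updates since the last overflow

    public:
      float factor() const { return factor_; }

      // Call once per update with "was the scaled gradient finite?"
      void update(bool gradientFinite) {
        if(gradientFinite) {
          if(++clean_ % freq_ == 0)
            factor_ *= multiplier_;  // more clean steps, more precision headroom
        } else {
          if(factor_ > minimum_)
            factor_ /= multiplier_;  // overflow seen, back off
          clean_ = 0;                // restart the clean-update counter
        }
      }
    };

    int main() {
      CostScaler s;
      for(int i = 0; i < 2000; ++i)
        s.update(/*gradientFinite=*/true);
      std::cout << s.factor() << "\n"; // 1024 after 2000 clean updates
      return 0;
    }

The factor is handed on to the optimizer update (see opt->update(..., costScalingFactor_) below) so that it can be compensated for there; the scaling only moves fp16 gradient values away from the underflow range, it is not meant to change the effective step size.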
--- src/common/aliases.cpp | 4 +- src/common/config_parser.cpp | 6 +- src/common/definitions.h | 10 +- src/models/transformer.h | 15 ++- src/tensors/cpu/tensor_operators.cpp | 4 + src/tensors/gpu/element.cu | 12 +- src/tensors/gpu/tensor_operators.cu | 147 ++++++++++++++++++------- src/tensors/tensor_operators.h | 19 ++++ src/training/graph_group.cpp | 118 ++++++++++---------- src/training/graph_group.h | 17 ++- src/training/graph_group_async.cpp | 6 +- src/training/graph_group_singleton.cpp | 8 +- src/training/graph_group_sync.cpp | 8 +- 13 files changed, 233 insertions(+), 141 deletions(-) diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index 0be26a8c8..99574fe11 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -29,8 +29,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) { cli.alias("fp16", "true", [&](YAML::Node& config) { if(mode_ == cli::mode::training) { config["precision"] = std::vector({"float16", "float32"}); // inference type, optimization type, save type - // scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor - config["cost-scaling"] = std::vector({"0", "1000", "2", "0.05", "10", "1e-5"}); + // scaling factor, frequency, multiplier at increase, minium scaling factor + config["cost-scaling"] = std::vector({"256.f", "1000", "2.f", "256.f"}); } else { config["precision"] = std::vector({"float16"}); // for inference we do not need the other types } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index b3e8950b9..51764cdc4 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -522,15 +522,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { // mixed precision training cli.add("--fp16", "Shortcut for mixed precision training with float16 and cost-scaling, " - "corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f"); + "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f"); cli.add>("--precision", "Mixed precision training for forward/backward pass and optimizaton. " "Defines types for: forward/backward pass, optimization.", {"float32", "float32"}); cli.add>("--cost-scaling", "Dynamic cost scaling for mixed precision training: " - "power of 2, scaling window, scaling factor, tolerance, range, minimum factor") - ->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f"); + "scaling factor, frequency, multiplier, minimum factor") + ->implicit_val("256.f 1000 2.f 256.f"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", diff --git a/src/common/definitions.h b/src/common/definitions.h index d2cf8aa41..d8a3ad465 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -106,24 +106,24 @@ using Weak = std::weak_ptr; /** @brief Creates shared_ptr of any type, passes all arguments to any available * constructor */ template -Ptr New(Args&&... args) { - return Ptr(new T(std::forward(args)...)); +inline Ptr New(Args&&... args) { + return std::make_shared(std::forward(args)...); } template -Ptr New(Ptr p) { +inline Ptr New(Ptr p) { return Ptr(p); } /** @brief Creates InstrusivePtr of any type, passes all arguments to any available * constructor */ template -IPtr INew(Args&&... args) { +inline IPtr INew(Args&&... 
args) { return IPtr(new T(std::forward(args)...)); } template -IPtr INew(Ptr p) { +inline IPtr INew(Ptr p) { return IPtr(p); } diff --git a/src/models/transformer.h b/src/models/transformer.h index 2393ad731..b2c0f6be5 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -147,8 +147,7 @@ class Transformer : public EncoderOrDecoderBase { int dimDepth = dimModel / dimHeads; - auto output - = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); + auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth] } @@ -361,9 +360,9 @@ class Transformer : public EncoderOrDecoderBase { Expr LayerAttention(std::string prefix, Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] - const Expr& keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] - const Expr& values, // ...? - const Expr& mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] + Expr keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + Expr values, // ...? + Expr mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] int dimHeads, bool cache = false, bool saveAttentionWeights = false) { @@ -373,6 +372,12 @@ class Transformer : public EncoderOrDecoderBase { auto opsPre = opt("transformer-preprocess"); auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb); + // fixes missing norm for keys and values in self-attention with pre-norm + if(input == keys) + keys = output; + if(input == values) + values = output; + // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1afb8f648..f3964f917 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -24,6 +24,10 @@ void IsNaN(const Tensor /*in*/, Ptr /*allocator*/, bool& /*isNaN*/, b ABORT("Not implemented"); } +bool SanitizeGradient(marian::Tensor /*in*/, Ptr /*allocator*/, bool /*pruneNaN*/, bool /*clipInf*/) { + ABORT("Not implemented"); +} + template void CopyCastTo(To* out, const From* in, int length) { for(int i = 0; i < length; ++i) diff --git a/src/tensors/gpu/element.cu b/src/tensors/gpu/element.cu index 6790efd4b..e9cbe0812 100755 --- a/src/tensors/gpu/element.cu +++ b/src/tensors/gpu/element.cu @@ -29,7 +29,9 @@ __global__ void gElement( indices[i] = tensors[i].shape().bindex(dims); } - tensors[0].data()[index] = functional::apply(functor, tensors, indices); + // This performs the internal application of the functor in float32 regardless of the input type. + // It seems there are no speed penalties but improved precision. + tensors[0].data()[index] = (T)functional::applyWithCast(functor, tensors, indices); } } } @@ -65,13 +67,7 @@ void Element(Functor functor, Tensor out, Tensors... 
tensors) { ElementTyped(functor, out, tensors...); } else if(out->type() == Type::float16) { #if COMPILE_FP16 - std::vector ts({out, tensors...}); - bool div2 = std::all_of(ts.cbegin(), ts.cend(), [](marian::Tensor t){ return t->shape()[-1] % 2 == 0; }); - if(div2) { - ElementTyped(functor, out, tensors...); - } else { - ElementTyped(functor, out, tensors...); - } + ElementTyped(functor, out, tensors...); #else ABORT("FP16 not supported with chosen current hardware or CUDA version"); #endif diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index d55214bc7..1347c3bbe 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -16,15 +16,12 @@ namespace gpu { namespace atomics { static inline __device__ void atomicAdd(float *address, float val) { - //*address += val; ::atomicAdd(address, val); } #if COMPILE_FP16 // @TODO: copied from CuTorch, adapt this better, give credit. static inline __device__ void atomicAdd(half *address, half val) { - //*address += val; - #if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10 ::atomicAdd(address, val); #else // __CUDA_ARCH__ < 700 @@ -50,7 +47,8 @@ static inline __device__ void atomicAdd(half *address, half val) { } while (assumed != old); #endif // __CUDA_ARCH__ } -#endif +#endif // COMPILE_FP16 + } @@ -96,6 +94,81 @@ void IsNaN(const Tensor in, Ptr allocator, bool& isNaN, bool& isInf) cudaStreamSynchronize(0); } +template +__global__ void gSanitizeGradient(T* in, int length, + bool* isNaN, bool* isInf, + bool pruneNaN, bool clipInf, + float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) { + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + float v = (float)in[index]; + // handle NaN + if(isnan(v)) { + if(pruneNaN) { + in[index] = (T)forNaN; + } else { + *isNaN = true; + } + } + // handle +/- Inf + if(isinf(v)) { + if(clipInf) { + in[index] = v > 0 ? (T)forInf : (T)forInfNeg; + } else { + *isInf = true; + } + } + } + } +} + +// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. +// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. +// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. +// In that case infinities do not result in a bad gradient, since they get clipped. +// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result +// in a bad gradient. +// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), +// we return `false` indicating a bad gradient. 
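For reference, the semantics spelled out in the comment above boil down to the following plain C++ sketch (CPU, scalar, illustrative only; in the patch the same logic runs as the CUDA kernel below and reduces the two flags on the device):

    // Sketch of the sanitization contract: prune NaN to 0 if requested,
    // clip +/-Inf to a finite bound if requested, report whether the
    // gradient is sane afterwards. Illustrative only, not the kernel itself.
    #include <cmath>
    #include <iostream>
    #include <vector>

    bool sanitizeGradient(std::vector<float>& grad, bool pruneNaN, bool clipInf,
                          float forInf = 65504.f /* fp16 max, as in the kernel defaults */) {
      bool sawNaN = false, sawInf = false;
      for(float& v : grad) {
        if(std::isnan(v)) {
          if(pruneNaN) v = 0.f;                      // removed, does not poison the update
          else         sawNaN = true;                // kept, gradient reported as bad
        } else if(std::isinf(v)) {
          if(clipInf) v = (v > 0 ? forInf : -forInf); // clipped, still usable
          else        sawInf = true;                  // kept, gradient reported as bad
        }
      }
      return !sawNaN && !sawInf; // true means the update is safe to apply
    }

    int main() {
      std::vector<float> g = {1.f, NAN, INFINITY, -2.f};
      bool sane = sanitizeGradient(g, /*pruneNaN=*/true, /*clipInf=*/true);
      std::cout << std::boolalpha << sane << " " << g[1] << " " << g[2] << "\n"; // true 0 65504
      return 0;
    }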
+bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { + cudaSetDevice(in->getDeviceId().no); + + int length = in->size(); + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + auto mem = allocator->alloc(2); + bool* dIsNaN = &mem->data()[0]; + bool* dIsInf = &mem->data()[1]; + fill(in->getBackend(), dIsNaN, dIsNaN + 2, false); + + float forNaN = 0.f; + float forInf = NumericLimits(in->type()).max; + float forInfNeg = NumericLimits(in->type()).lowest; + + if(in->type() == Type::float32) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#endif + } else { + ABORT("gSanitizeGradient for type {} not implemented", in->type()); + } + + bool isNaN, isInf; + CudaCopy(dIsNaN, dIsNaN + 1, &isNaN); + CudaCopy(dIsInf, dIsInf + 1, &isInf); + + allocator->free(mem); + + cudaStreamSynchronize(0); + + return !isNaN && !isInf; +} + template __global__ void gCopyCastTo(To* out, const From* in, int length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { @@ -1090,7 +1163,7 @@ void PasteRows(Tensor out, size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); -#if 1 // @TODO: make this configurable with a 'deterministic' flag +#if 0 // @TODO: make this configurable with a 'deterministic' flag // If we only use one block, then each core operates on a different column, // hence the summation becomes deterministic. // However, we only use e.g. 512 cores out of possibly 3000+, so this will be @@ -1355,7 +1428,7 @@ __global__ void gGRUFastForward(T* out, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOut = out + j * cols; const T* rowState = state + j * cols; @@ -1365,21 +1438,21 @@ __global__ void gGRUFastForward(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { - T r = functional::Ops::sigmoid(xWrow[i] + sUrow[i] + b[i]); + float r = functional::Ops::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]); int k = i + cols; - T z = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); + float z = functional::Ops::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]); int l = i + 2 * cols; - T h; + float h; if(final) - h = functional::Ops::tanh(xWrow[l] + (sUrow[l] + b[l]) * r); + h = functional::Ops::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(xWrow[l] + sUrow[l] * r + b[l]); + h = functional::Ops::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]); - T out = ((T)1.f - z) * h + z * rowState[i]; - rowOut[i] = m * out + ((T)1.f - m) * rowState[i]; + float out = (1.f - z) * h + z * (float)rowState[i]; + rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]); } } } @@ -1441,7 +1514,7 @@ __global__ void gGRUFastBackward(T* outState, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOutState = outState + j * cols; T* rowOutXW = outXW + j * cols * 3; @@ -1459,56 +1532,56 @@ __global__ void gGRUFastBackward(T* outState, int k = i + cols; int l = i + 2 * cols; - T r = functional::Ops::sigmoid(rowXW[i] + rowSU[i] + b[i]); - T z 
= functional::Ops::sigmoid(rowXW[k] + rowSU[k] + b[k]); + float r = functional::Ops::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]); + float z = functional::Ops::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]); - T h; + float h; if(final) - h = functional::Ops::tanh(rowXW[l] + (rowSU[l] + b[l]) * r); + h = functional::Ops::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(rowXW[l] + rowSU[l] * r + b[l]); + h = functional::Ops::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]); - T adj = rowAdj[i]; + float adj = rowAdj[i]; - T t = ((T)1.f - z) * ((T)1.f - h * h); + float t = (1.f - z) * (1.f - h * h); // df/ds if(outState) - rowOutState[i] += (m * z - m + (T)1.f) * adj; + rowOutState[i] += (T)((m * z - m + 1.f) * adj); // df/d(xW_r) ... - T dfdxW_r = m * r * ((T)1.f - r) * t * adj; + float dfdxW_r = m * r * (1.f - r) * t * adj; if(final) - dfdxW_r *= rowSU[l] + b[l]; + dfdxW_r *= (float)rowSU[l] + (float)b[l]; else - dfdxW_r *= rowSU[l]; + dfdxW_r *= (float)rowSU[l]; if(outXW) - rowOutXW[i] += dfdxW_r; + rowOutXW[i] += (T)dfdxW_r; if(outSU) - rowOutSU[i] += dfdxW_r; + rowOutSU[i] += (T)dfdxW_r; if(outB) - rowOutB[i] += dfdxW_r; + rowOutB[i] += (T)dfdxW_r; // df/d(xW_z) ... - T dfdxW_z = m * ((T)1.f - z) * z * (rowState[i] - h) * adj; + float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj; if(outXW) - rowOutXW[k] += dfdxW_z; + rowOutXW[k] += (T)dfdxW_z; if(outSU) - rowOutSU[k] += dfdxW_z; + rowOutSU[k] += (T)dfdxW_z; if(outB) - rowOutB[k] += dfdxW_z; + rowOutB[k] += (T)dfdxW_z; // df/d(xW_x) ... - T dfdxW_x = m * t * adj; + float dfdxW_x = m * t * adj; if(outXW) - rowOutXW[l] += dfdxW_x; + rowOutXW[l] += (T)dfdxW_x; if(outSU) - rowOutSU[l] += dfdxW_x * r; + rowOutSU[l] += (T)(dfdxW_x * r); if(outB) if(final) - rowOutB[l] += dfdxW_x * r; + rowOutB[l] += (T)(dfdxW_x * r); else - rowOutB[l] += dfdxW_x; + rowOutB[l] += (T)dfdxW_x; } } } diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 6e587953c..dc29bf356 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -41,6 +41,25 @@ DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor); DISPATCH2(AddCast, marian::Tensor, const marian::Tensor); DISPATCH4(IsNaN, const Tensor, Ptr, bool&, bool&); +#ifdef CUDA_FOUND +namespace gpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} +#endif + +namespace cpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} + +static inline bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { +#ifdef CUDA_FOUND + if(in->getBackend()->getDeviceId().type == DeviceType::gpu) + return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); + else +#endif + return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); +} + template void Element(Functor functor, marian::Tensor out, Tensors... 
tensors) { #ifdef CUDA_FOUND diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index e9c977b9c..03e5acf40 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -10,25 +10,19 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) mbRoundUp_(options_->get("mini-batch-round-up", true)) { if(options_->hasAndNotEmpty("cost-scaling")) { auto vcs = options_->get>("cost-scaling"); - costScale_ = true; - float costExponent = std::stof(vcs[0]); - costScaleFactor_ = std::pow(2.0f, costExponent); - - if(vcs.size() > 1) costScaleFreq_ = std::stoul(vcs[1]); - if(vcs.size() > 2) costScaleMultiplier_ = std::stof(vcs[2]); - if(vcs.size() > 3) costScaleNanTolerance_ = std::stof(vcs[3]); - if(vcs.size() > 4) costScaleNanRange_ = std::stoul(vcs[4]); - if(vcs.size() > 5) costScaleFactorMinimum_ = std::stof(vcs[5]); + + costScaling_ = true; + costScalingFactor_ = std::stof( vcs[0]); + if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]); + if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]); + if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]); LOG_ONCE(info, - "Training with cost scaling - factor: 2^{} = {}, frequency: {}, multiplier: {}, tolerance: {}, range: {}, minimum: {}", - costExponent, - costScaleFactor_, - costScaleFreq_, - costScaleMultiplier_, - costScaleNanTolerance_, - costScaleNanRange_, - costScaleFactorMinimum_); + "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}", + costScalingFactor_, + costScalingFreq_, + costScalingMultiplier_, + costScalingFactorMinimum_); } if(options_->hasAndNotEmpty("dynamic-gradient-scaling")) { @@ -96,21 +90,17 @@ void GraphGroup::initGraphsAndOpts() { // given number of iterations. Usually we increase by 2 which adds // one more bit for precision. void GraphGroup::increaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; noNanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = noNanSeen_ == (float)nanSeen_ / (float)total; // total is at least 1 because of noNanSeen_++ - if(noNanSeen_ % costScaleFreq_ == 0) { - costScaleFactor_ *= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} after {} gradient updates. Increasing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); + if(noNanSeen_ % costScalingFreq_ == 0) { + costScalingFactor_ *= costScalingMultiplier_; + if(isMainProcess()) + LOG(debug, "No NaN/Inf after {} gradient updates. Increasing cost-scaling factor to {}", total, costScalingFactor_); // Resetting counts after cost-scale change noNanSeen_ = 0; @@ -120,48 +110,56 @@ void GraphGroup::increaseCostScaleFactor() { // call when a NaN was seen to decrease cost-scaling factor void GraphGroup::decreaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; nanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = (float)nanSeen_ / (float)total; // total is at least 1 because of nanSeen_++ - if(total >= costScaleNanRange_ && nanPercent > costScaleNanTolerance_) { - if(costScaleFactor_ > costScaleFactorMinimum_) { - costScaleFactor_ /= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} in {} gradient updates, reducing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); - } else { - // @TODO: think if should this rather abort? 
- LOG(warn, - "NaN/Inf percentage {:.2f} in {} gradient updates, but cost-scaling factor {} is already at minimum", - nanPercent, - total, - costScaleFactor_); - } - // Resetting counts after cost-scale change - noNanSeen_ = 0; - nanSeen_ = 0; + // do not reduce cost-scaling factor below minimum + if(costScalingFactor_ > costScalingFactorMinimum_) + costScalingFactor_ /= costScalingMultiplier_; + + if(isMainProcess()) { + if(costScalingFactor_ > costScalingFactorMinimum_) + LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to {}", total, costScalingFactor_); + else + LOG(debug, "Seen NaN/Inf after {} gradient updates, Reduced cost-scaling factor to minimum {}. Pruning NaNs now.", total, costScalingFactor_); } + + // Resetting counts after cost-scale change + noNanSeen_ = 0; + nanSeen_ = 0; } float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) { auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin); - if(checkGradientNan_ || costScale_) { - bool hasNan = false, hasInf = false; - IsNaN(curGrad, graphs_[i]->allocator(), hasNan, hasInf); // @TODO: make safe with different compiler options - if(hasNan || hasInf) { - LOG(debug, "Found Nan ({}) or Inf ({})", hasNan, hasInf); + // If costScaling_ then check for NaN values if the costScalingFactor_ is larger than + // the minimum. If a NaN value is seen we exit here and will reduce the factor next and + // this skips an update. + // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces + // NaNs with 0. Updates are not skipped any more. + // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type. + // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent + // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large. + if(costScaling_ || checkGradientNan_) { + bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_; + bool clipInf = !checkGradientNan_; + bool saneGradient = SanitizeGradient(curGrad, graphs_[i]->allocator(), pruneNaN, clipInf); + + // This should never happen, if it does, something is wrong with the kernel above and needs to be fixed. + ABORT_IF(pruneNaN && clipInf && !saneGradient, "We are removing NaNs and clipping Infs, but gradient is still not sane??"); + + if(!saneGradient) { + LOG(debug, "Found NaN"); return std::numeric_limits::quiet_NaN(); } } - + + // The optional clipping above will affect the norm here. The norm can be non-finite despite the above + // gradient sanitization, hence check again and propagate a NaN. if(dynamicGradientScaling_) { auto gNorm = L2Norm(curGrad, graphs_[i]->allocator()); if(isFinite(gNorm) && gNorm > 0.0) @@ -197,8 +195,8 @@ float GraphGroup::executeAndCollectNorm(const std::functionget("normalize-gradient")) normalizationFactor *= updateTrgWords; @@ -207,9 +205,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) return normalizationFactor; if(dynamicGradientScaling_) { - // make gradient norm invariant to changes in costScaleFactor_, luckily norm(c * g) = c * norm(g) - if(costScale_) - gNorm = gNorm / costScaleFactor_; + // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g) + if(costScaling_) + gNorm = gNorm / costScalingFactor_; // Normalize gradient norm w.r.t. 
number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this @@ -288,9 +286,7 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { restoreFromCheckpoint(modelFileName, scatterFn); } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string nameInit = options_->get("pretrained-model"); - LOG(info, - "[training] Initializing model weights with pre-trained model {}", - nameInit); + LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit); size_t i = 0; for(auto graph : graphs_) diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 422990b16..b7f2f7efc 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -60,22 +60,21 @@ class GraphGroup { double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false - bool costScale_{false}; - float costScaleFactor_{1.f}; // @TODO, add current costScaleFactor_ to trainingState for serialization - size_t costScaleFreq_{2000}; - float costScaleMultiplier_{2.f}; - float costScaleNanTolerance_{0.f}; - size_t costScaleNanRange_{1}; - float costScaleFactorMinimum_{1.f}; // @TODO make this configureable + bool costScaling_{false}; + float costScalingFactor_{1.f}; // @TODO, add current costScalingFactor_ to trainingState for serialization + size_t costScalingFreq_{2000}; + float costScalingMultiplier_{2.f}; + float costScalingFactorMinimum_{1.f}; + size_t noNanSeen_{0}; // @TODO, add current noNanSeen_ to trainingState for serialization size_t nanSeen_{0}; + bool checkGradientNan_{false}; + bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; - bool checkGradientNan_{false}; - // determines the number of input streams (i.e. input files or fields in the TSV input) that need // to be included in the batch, i.e. 
without alignments and weights size_t numberOfInputFiles(); diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp index 72b06e489..f85f9cf85 100644 --- a/src/training/graph_group_async.cpp +++ b/src/training/graph_group_async.cpp @@ -143,13 +143,13 @@ void AsyncGraphGroup::execute(Ptr batch) { thread_local Tensor accGradients; thread_local Ptr accAlloc; - ABORT_IF(costScale_ ,"Cost-scaling not implemented for AsyncSGD"); + ABORT_IF(costScaling_ ,"Cost-scaling not implemented for AsyncSGD"); auto graph = graphs_[tid]; Ptr dynamicLoss = models_[tid]->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // it's ok to go out of scope, this will still insert the new top node into the graph - auto costNode = dynamicLoss->loss() * costScaleFactor_; + auto costNode = dynamicLoss->loss() * costScalingFactor_; } if(t % optimizerDelay_ == 0) { diff --git a/src/training/graph_group_singleton.cpp b/src/training/graph_group_singleton.cpp index 7dc861375..162610705 100644 --- a/src/training/graph_group_singleton.cpp +++ b/src/training/graph_group_singleton.cpp @@ -16,16 +16,16 @@ void SingletonGraph::execute(Ptr batch) { auto opt = optimizerShards_[0]; auto lossNode = model->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // for fp16 training, it's ok to go out of scope, we do not use the scaled version for anything - auto scaledLoss = lossNode->loss() * costScaleFactor_; + auto scaledLoss = lossNode->loss() * costScalingFactor_; } graph->forward(); graph->backward(); bool noNanOrInf = true; - if(costScale_) { + if(costScaling_) { // Are there NaNs in the gradient? bool hasNan = false, hasInf = false; IsNaN(graph->params()->grads(), graph->allocator(), hasNan, hasInf); @@ -39,7 +39,7 @@ void SingletonGraph::execute(Ptr batch) { opt->update(graph->params()->vals(), graph->params()->grads(), batch->wordsTrg(), - costScaleFactor_); + costScalingFactor_); if(scheduler_) { scheduler_->update(*lossNode, batch); diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp index 8c06761e1..c90a384e4 100644 --- a/src/training/graph_group_sync.cpp +++ b/src/training/graph_group_sync.cpp @@ -252,8 +252,8 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num { // let loss go out of scope, frees memory auto rationalLoss = models_[localDeviceIndex]->build(graph, subBatch); - if(costScaleFactor_ != 1.f) - rationalLoss->loss() * costScaleFactor_; + if(costScalingFactor_ != 1.f) + rationalLoss->loss() * costScalingFactor_; graph->forward(); localDeviceLosses[localDeviceIndex] += *rationalLoss; @@ -262,7 +262,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num graph->backward(/*zero=*/false); // (gradients are reset before we get here) } -#if 1 +#if 0 // @TODO: this can probably be removed now, keep around until confirmed. // experimental and should eventually be somewhere else // Handle local gradient explosion but only clip to largest possible value // given number of GPUs and type. Should clip rarely. 
Also clips inf @@ -284,7 +284,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards float gradNorm = 0.f; - if(costScale_ || dynamicGradientScaling_ || checkGradientNan_) { + if(costScaling_ || dynamicGradientScaling_ || checkGradientNan_) { // Wrapping member function auto checkNanOrNorm = [&](size_t i, size_t begin, size_t end) { return GraphGroup::checkNanOrNorm(i, begin, end); From 2bdfbd3f02fb7e85b865efdfc93a8ebaa99a7c74 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Sun, 21 Nov 2021 17:06:01 +0000 Subject: [PATCH 120/254] Update badges in README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 17a33728a..69ae220cb 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ Marian ====== -[![Build Status CUDA 9](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-9.2.svg?label=CUDA%209)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-9.2/) -[![Build Status CUDA 10](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.1.svg?label=CUDA%2010)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.1/) + +[![Build Status CUDA 10](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.2.svg?label=CUDA%2010.2)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.2/) +[![Build Status CUDA 11](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-11.4.svg?label=CUDA%2011.4)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-11.4/) [![Build Status CPU](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/) [![Tests Status](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-regression-tests.svg?label=tests)](http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/) [![Latest release](https://img.shields.io/github/release/marian-nmt/marian.svg?label=release)](https://github.com/marian-nmt/marian/releases) From c85d0608483789d446361ea28d95f7d7c9545f2d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 22 Nov 2021 03:32:54 +0000 Subject: [PATCH 121/254] Merged PR 20729: Add top-k sampling This adds Top-K sampling to Marian and extends the --output-sampling option to take arguments --- regression-tests | 2 +- src/common/config_parser.cpp | 7 ++-- src/graph/expression_operators.cpp | 7 ++++ src/graph/expression_operators.h | 15 ++++++- src/graph/node_operators_binary.h | 61 ++++++++++++++++++++++++++- src/graph/node_operators_tuple.h | 2 +- src/models/costs.cpp | 35 ++++++++++++++++ src/models/costs.h | 32 +++++++++----- src/models/model_factory.cpp | 21 ++++++++-- src/tensors/cpu/tensor_operators.cpp | 9 +++- src/tensors/gpu/tensor_operators.cu | 63 +++++++++++++++------------- src/tensors/tensor_operators.h | 23 +++++++++- src/translator/translator.h | 2 +- 13 files changed, 226 insertions(+), 53 deletions(-) diff --git a/regression-tests b/regression-tests index 7d612ca5e..0aa7b6b76 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 7d612ca5e4b27a76f92584dad76d240e34f216d0 +Subproject commit 0aa7b6b7632732d1f22f3d8169d3262a7e6b1e9d diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 51764cdc4..59b328e92 
100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -695,9 +695,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Use softmax shortlist: path first best prune"); cli.add>("--weights", "Scorer weights"); - cli.add("--output-sampling", - "Noise output layer with gumbel noise", - false); + cli.add>("--output-sampling", + "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. " + " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.") + ->implicit_val("full"); cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 560ab4e73..b26c2ae0f 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -357,6 +357,13 @@ Expr gather(Expr a, int axis, Expr indices) { return Expression(a, axis, indices); } +// scatter() -- scatter arbitrary elements along an axis; batched or non-batched +// This is the reverse operation to gather. +Expr scatter(Expr a, int axis, Expr indices, Expr source) { + return Expression(a, axis, indices, source); +} + + // index_select() -- gather arbitrary elements along an axis from an unbatched // input 'a'. Indices are specified as a 1D vector. // This is used e.g. for embedding lookup. diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index e34ddc8ac..d032e8d3b 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -707,10 +707,23 @@ Expr stopGradient(Expr a); * @param indices The indices to be gathered * @returns Gathered expression with the same shape as @p indices * @note @p a and @p indices must have the same rank - * @note The non-target axes of @p a and @p indicies must have the same size, or be broadcastable. + * @note The non-target axes of @p a and @p indices must have the same size, or be broadcastable. */ Expr gather(Expr a, int axis, Expr indices); +/** + * Scatter elements from source along an axis into a. Unindexed elements from a remain unchanged. + * This is the reverse operation to gather. + * @param a The input expression + * @param axis The axis along which to index + * @param indices The indices to be scattered + * @param source Expression with values to scatter. + * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices + * @note @p source and @p indices must have the same rank + * @note In this version @p source and @p indicies must have the same shape + */ +Expr scatter(Expr a, int axis, Expr indices, Expr source); + #if 0 // reverse operation to gather. a is expression into with values from b are inserted and positions indices along axis. 
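To make the gather/scatter duality described above concrete: gather reads a[idx[i]] into out[i], while scatter writes source[i] into a[idx[i]] and leaves every other position of a untouched. Below is a tiny 1-D sketch in plain C++ (illustrative names, not the Expr API); the same pattern is what the top-k sampling code later in this patch relies on when it scatters Gumbel-noised top-k scores back into a tensor filled with invalid-path scores.

    // Minimal 1-D illustration of gather vs. scatter along the last axis.
    // The real ops work on batched tensors; this is a conceptual sketch only.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    std::vector<float> gather1d(const std::vector<float>& a,
                                const std::vector<std::size_t>& idx) {
      std::vector<float> out(idx.size());
      for(std::size_t i = 0; i < idx.size(); ++i)
        out[i] = a[idx[i]];            // read indexed elements
      return out;
    }

    void scatter1d(std::vector<float>& a,
                   const std::vector<std::size_t>& idx,
                   const std::vector<float>& source) {
      for(std::size_t i = 0; i < idx.size(); ++i)
        a[idx[i]] = source[i];         // write into indexed positions; rest unchanged
    }

    int main() {
      std::vector<float> logits = {0.1f, 2.0f, -1.0f, 3.5f};
      std::vector<std::size_t> topkIdx = {3, 1};        // positions of the top-2 logits
      auto topkVal = gather1d(logits, topkIdx);         // {3.5, 2.0}
      std::vector<float> masked(logits.size(), -1e9f);  // "invalid path" scores
      scatter1d(masked, topkIdx, topkVal);              // only top-k positions become usable
      for(float v : masked)
        std::cout << v << " ";                          // -1e+09 2 -1e+09 3.5
      std::cout << "\n";
      return 0;
    }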
// with broadcasting diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index a180bb5c8..b2a646b1c 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1033,12 +1033,14 @@ struct GatherNodeOp : public NaryNodeOp { NodeOps forwardOps() override { return {NodeOp( + // @TODO: rename to gather Select(val_, child(0)->val(), child(1)->val(), axis_))}; } NodeOps backwardOps() override { return {NodeOp( - Insert(child(0)->grad(), adj_, child(1)->val(), axis_))}; + // @TODO: rename to scatter + Insert(child(0)->grad(), adj_, child(1)->val(), axis_))}; } Shape newShape(Expr a, int axis, Expr indices) { @@ -1046,7 +1048,6 @@ struct GatherNodeOp : public NaryNodeOp { axis = shape.axis(axis); auto rank = shape.size(); ABORT_IF(rank != indices->shape().size(), "Mismatching ranks for input ({}) and indices ({})", std::string(shape), std::string(indices->shape())); - axis = a->shape().axis(axis); shape.set(axis, indices->shape()[axis]); for (size_t i = 0; i < rank; ++i) { if (i != axis) { @@ -1086,6 +1087,62 @@ struct GatherNodeOp : public NaryNodeOp { int axis_; }; +struct ScatterNodeOp : public NaryNodeOp { + ScatterNodeOp(Expr a, int axis, Expr indices, Expr source) + : NaryNodeOp({a, indices, source}, newShape(a, axis, indices, source), a->value_type()), + axis_(a->shape().axis(axis)) { + matchOrAbort(indices->value_type()); + } + + NodeOps forwardOps() override { + return {NodeOp( + CopyCast(val_, child(0)->val()); // @TODO: use normal copy + Insert(val_, child(2)->val(), child(1)->val(), axis_) + )}; + } + + NodeOps backwardOps() override { + ABORT("backward for ScatterNodeOp not yet implemented"); + } + + Shape newShape(Expr a, int axis, Expr indices, Expr source) { + ABORT_IF(axis != -1, "only last dimensions"); + ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); + + Shape shape = a->shape(); + // @TODO: do proper checking + return shape; + } + + const std::string type() override { return "scatter"; } + + const std::string color() override { return "orange"; } + + virtual size_t hash() override { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, axis_); + hash_ = seed; + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + return true; + } + +private: + friend class SerializationHelpers; + int axis_; +}; + struct ColsNodeOp : public NaryNodeOp { ColsNodeOp(Expr a, Expr indices) : NaryNodeOp({a, indices}, newShape(a, indices), a->value_type()) { diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h index c7a9531a1..8acb1bc83 100644 --- a/src/graph/node_operators_tuple.h +++ b/src/graph/node_operators_tuple.h @@ -133,7 +133,7 @@ struct TopKNodeOp : public UnaryNodeOp, } void backward() override { - Insert(/*out*/child(0)->grad(), adj_, val_, axis_); + Insert(/*out*/child(0)->grad(), adj_, val_, axis_); } const std::string type() override { return "topk"; } diff --git a/src/models/costs.cpp b/src/models/costs.cpp index c688b2119..4b15bcb36 100644 --- a/src/models/costs.cpp +++ b/src/models/costs.cpp @@ -10,5 +10,40 @@ Ptr LogSoftmaxStep::apply(Ptr state) { return state; } +Ptr GumbelSoftmaxStep::apply(Ptr state) { + state->setLogProbs(state->getLogProbs().applyUnaryFunctions( + [](Expr logits) { // lemma gets gumbelled + return logsoftmax(logits + constant_like(logits, 
inits::gumbel()));
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
+TopkGumbelSoftmaxStep::TopkGumbelSoftmaxStep(int k) : k_{k} {}
+
+Ptr TopkGumbelSoftmaxStep::apply(Ptr state) {
+  state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+      [=](Expr logits) { // lemma gets gumbelled
+        // create logits-sized tensor consisting only of invalid path scores
+        float invalidPathScore = NumericLimits(logits->value_type()).lowest;
+        Expr invalidLogits = constant_like(logits, inits::fromValue(invalidPathScore));
+
+        // select top-k values
+        Expr val, idx;
+        std::tie(val, idx) = topk(logits, k_, /*axis=*/-1, /*descending=*/true);
+
+        // uncomment below to display probability mass in top-k selection
+        // debug(sum(gather(softmax(logits), -1, idx), -1), "sum");
+
+        // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search
+        Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel()));
+
+        // Scatter gumbelled values back into logits to fill with usable values
+        return scatter(invalidLogits, -1, idx, gumbelVal);
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
}  // namespace models
}  // namespace marian
diff --git a/src/models/costs.h b/src/models/costs.h
index e5463bfd0..a087ed6af 100644
--- a/src/models/costs.h
+++ b/src/models/costs.h
@@ -297,20 +297,30 @@ class LogSoftmaxStep : public ILogProbStep {
  virtual Ptr apply(Ptr state) override;
};

-// Gumbel-max noising for sampling during beam-search
-// Seems to work well enough with beam-size=1. Turn on
-// with --output-sampling during translation with marian-decoder
+// Gumbel-max noising for sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling [full] during translation
+// with marian-decoder for sampling from the full
+// softmax distribution.
class GumbelSoftmaxStep : public ILogProbStep {
public:
  virtual ~GumbelSoftmaxStep() {}
-  virtual Ptr apply(Ptr state) override {
-    state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
-        [](Expr logits) { // lemma gets gumbelled
-          return logsoftmax(logits + constant_like(logits, inits::gumbel()));
-        },
-        logsoftmax)); // factors don't
-    return state;
-  }
+  virtual Ptr apply(Ptr state) override;
+};
+
+
+// Gumbel-max noising for top-k sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling topk [10] during translation
+// with marian-decoder for top-10 sampling.
+class TopkGumbelSoftmaxStep : public ILogProbStep {
+private:
+  int k_{1};
+
+public:
+  TopkGumbelSoftmaxStep(int k);
+  virtual ~TopkGumbelSoftmaxStep() {}
+  virtual Ptr apply(Ptr state) override;
};

// class to wrap an IEncoderDecoder and a ILogProbStep that are executed in sequence,
diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp
index e176e6a4c..52a87e72a 100644
--- a/src/models/model_factory.cpp
+++ b/src/models/model_factory.cpp
@@ -370,10 +370,25 @@ Ptr createModelFromOptions(Ptr options, usage use) {
  // add (log)softmax if requested
  if (use == usage::translation) {
    if(std::dynamic_pointer_cast(baseModel)) {
-      if(options->get("output-sampling", false))
-        return New(std::dynamic_pointer_cast(baseModel), New());
-      else
+      if(options->hasAndNotEmpty("output-sampling")) {
+        auto sampling = options->get>("output-sampling", {});
+        std::string method = sampling.size() > 0 ?
sampling[0] : "full"; + + if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) { + LOG(info, "Output sampling from the full softmax distribution"); + return New(std::dynamic_pointer_cast(baseModel), New()); + } else if(method == "topk") { + int k = sampling.size() > 1 ? std::stoi(sampling[1]) : 10; + if(k == 1) + LOG(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); + LOG(info, "Output sampling via top-{} sampling", k); + return New(std::dynamic_pointer_cast(baseModel), New(k)); + } else { + ABORT("Unknown sampling method: {}", method); + } + } else { return New(std::dynamic_pointer_cast(baseModel), New()); + } } #ifdef COMPILE_EXAMPLES // note: 'usage::translation' here means 'inference' diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index f3964f917..1e1adc38b 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -739,6 +739,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -760,10 +761,16 @@ void Insert(Tensor out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axisCPU] = (int)indices->data()[idxIndex]; int outIndex = outShape.index(dims); - out->data()[outIndex] += in->data()[index]; + if(add) + out->data()[outIndex] += in->data()[index]; + else + out->data()[outIndex] = in->data()[index]; } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + void GRUFastForward(Tensor out_, std::vector inputs, bool final) { int rows = out_->shape().elements() / out_->shape().back(); int cols = out_->shape().back(); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 1347c3bbe..2103ca9de 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1309,7 +1309,7 @@ __global__ void gSelect(T* out, } } -template +template __global__ void gInsert(T* out, functional::Shape outShape, const T* in, @@ -1327,7 +1327,10 @@ __global__ void gInsert(T* out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int outIndex = outShape.index(dims); - out[outIndex] += in[index]; // this is probably wrong, atomicAdd? + if(add) + out[outIndex] += in[index]; // this is probably wrong, atomicAdd? 
+ else + out[outIndex] = in[index]; } } } @@ -1349,21 +1352,21 @@ void Select(Tensor out, if(out->type() == Type::float32) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else if(out->type() == Type::uint32) { gSelect<<>>(out->data(), @@ -1378,6 +1381,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -1393,28 +1397,31 @@ void Insert(Tensor out, int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else { ABORT("Insert not implemented for type {}", out->type()); } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + template __global__ void gGRUFastForward(T* out, const T* state, diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index dc29bf356..1fc4542d8 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -297,7 +297,28 @@ DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int) -DISPATCH4(Insert, marian::Tensor, const marian::Tensor, const marian::Tensor, int) + +#ifdef CUDA_FOUND +namespace gpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} +#endif + +namespace cpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} + +template +static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Insert(out, in, indices, axis); + else +#endif + cpu::Insert(out, in, indices, axis); +} DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); diff --git a/src/translator/translator.h b/src/translator/translator.h index db1f3d030..3e375f65d 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -119,7 +119,7 @@ class Translate : public ModelTask { threadPool.enqueue(task, device, id++); } - if(options_->get("output-sampling", false)) { + if(options_->hasAndNotEmpty("output-sampling")) { if(options_->get("beam-size") > 1) LOG(warn, "[warning] Output sampling and beam search (beam-size > 1) are contradictory methods " From 3b4e943cda232062e7e28f496bfed724f3f20333 Mon Sep 17 00:00:00 
2001 From: David Meikle Date: Mon, 22 Nov 2021 12:22:06 +0000 Subject: [PATCH 122/254] Added pragma to ignore unused-private-field error on elementType_ which failed in macOS (#872) Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 1 + src/tensors/cpu/fbgemm/expanded_gemm.h | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0b853144..169a1a5e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option ### Fixed +- Added pragma to ignore unused-private-field error on elementType_ on macOS - Do not set guided alignments for case augmented data if vocab is not factored - Various fixes to enable LSH in Quicksand - Added support to MPIWrappest::bcast (and similar) for count of type size_t diff --git a/src/tensors/cpu/fbgemm/expanded_gemm.h b/src/tensors/cpu/fbgemm/expanded_gemm.h index 2c376d6e2..cca18725d 100644 --- a/src/tensors/cpu/fbgemm/expanded_gemm.h +++ b/src/tensors/cpu/fbgemm/expanded_gemm.h @@ -292,8 +292,19 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp { size_t k_; bool transA_; bool transB_; + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-private-field" +#endif + Type elementType_; +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + + public: FbgemmPacked8AffineNodeOp(Type elementType, const std::vector& nodes, From 3d15cd3d2020abf561b7e0d7ffa87b15baf0dfb1 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 22 Nov 2021 06:41:16 -0800 Subject: [PATCH 123/254] Update submodule regression-tests --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 7d612ca5e..32a2f7960 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 7d612ca5e4b27a76f92584dad76d240e34f216d0 +Subproject commit 32a2f7960d8cc48d6c90cbb5d03fbb42eb923d3d From 1adf80b7c9d2b3fc688cf16114e5e9b01425f3a2 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Mon, 22 Nov 2021 19:19:58 +0000 Subject: [PATCH 124/254] Task alias validation during training mode (#886) * Attempt to validate task alias * Validate allowed options for --task alias * Update comment in aliases.cpp * Show allowed values for alias Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 1 + src/common/aliases.cpp | 2 ++ src/common/cli_wrapper.cpp | 17 ++++++++++++++++- src/common/config_parser.cpp | 3 ++- 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 169a1a5e1..4c6249544 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed loading binary models on architectures where `size_t` != `uint64_t`. - Missing float template specialisation for elem::Plus - Broken links to MNIST data sets +- Enforce validation for the task alias in training mode. ### Changed - Optimize LSH for speed by treating is as a shortlist generator. No option changes in decoder diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index 0be26a8c8..36613327e 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -19,6 +19,8 @@ namespace marian { * As aliases are key-value pairs by default, values are compared as std::string. 
 * If the command line option corresponding to the alias is a vector, the alias
 * will be triggered if the requested value exists in that vector at least once.
+ * By design if an option value that is not defined for that alias option below
+ * is used, the CLI parser will abort with 'unknown value for alias' error.
 *
 * @see CLIWrapper::alias()
 *
 *
diff --git a/src/common/cli_wrapper.cpp b/src/common/cli_wrapper.cpp
index 9a5a1a2c4..211dd0b92 100644
--- a/src/common/cli_wrapper.cpp
+++ b/src/common/cli_wrapper.cpp
@@ -132,8 +132,14 @@ void CLIWrapper::parseAliases() {
  if(aliases_.empty())
    return;

+  // Find the set of values allowed for each alias option.
+  // Later we will check and abort if an alias option has an unknown value.
+  std::unordered_map> allowedAliasValues;
+  for(auto &&alias : aliases_)
+    allowedAliasValues[alias.key].insert(alias.value);
+
  // Iterate all known aliases, each alias has a key, value, and config
-  for(const auto &alias : aliases_) {
+  for(auto &&alias : aliases_) {
    // Check if the alias option exists in the config (it may come from command line or a config
    // file)
    if(config_[alias.key]) {
@@ -145,6 +151,15 @@ void CLIWrapper::parseAliases() {
      bool expand = false;
      if(config_[alias.key].IsSequence()) {
        auto aliasOpts = config_[alias.key].as>();
+        // Abort if an alias option has an unknown value, i.e. value that has not been defined
+        // in common/aliases.cpp
+        for(auto &&aliasOpt : aliasOpts)
+          if(allowedAliasValues[alias.key].count(aliasOpt) == 0) {
+            std::vector allowedOpts(allowedAliasValues[alias.key].begin(),
+                                    allowedAliasValues[alias.key].end());
+            ABORT("Unknown value '" + aliasOpt + "' for alias option --" + alias.key + ". "
+                  "Allowed values: " + utils::join(allowedOpts, ", "));
+          }
        expand = std::find(aliasOpts.begin(), aliasOpts.end(), alias.value) != aliasOpts.end();
      } else {
        expand = config_[alias.key].as() == alias.value;
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 30d77e369..8da9520c8 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -557,7 +557,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
  addSuboptionsULR(cli);

  cli.add>("--task",
-      "Use predefined set of options. Possible values: transformer, transformer-big");
+      "Use predefined set of options. Possible values: transformer-base, transformer-big, "
+      "transformer-base-prenorm, transformer-big-prenorm");
  cli.switchGroup(previous_group);
  // clang-format on
}

From ab6b8260835a9f77a7b3cd48e3af4e4039280384 Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev
Date: Tue, 23 Nov 2021 10:13:29 +0000
Subject: [PATCH 125/254] Add GCC 11 support (#888)

* Add GCC 11 support

Some C++ Standard Library headers have been changed to no longer include other
headers that they do not need to depend on. As such, C++ programs that used
standard library components without including the right headers will no longer
compile. The following headers are used less widely in libstdc++ and may need
to be included explicitly when compiled with GCC 11:
<limits> (for std::numeric_limits)
<memory> (for std::unique_ptr, std::shared_ptr etc.)
<utility> (for std::pair, std::tuple_size, std::index_sequence etc.)
<thread> (for members of namespace std::this_thread.)
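
As a minimal sketch of what this means in practice (the snippet below is
illustrative only and not part of this patch; the file name and values are
made up), a translation unit that previously compiled via transitive includes
now has to spell out the headers listed above:

    // gcc11_includes_sketch.cpp -- illustrative only, not Marian code
    #include <limits>   // std::numeric_limits
    #include <memory>   // std::unique_ptr, std::shared_ptr, std::make_unique
    #include <thread>   // std::this_thread
    #include <utility>  // std::pair

    int main() {
      std::unique_ptr<int> answer = std::make_unique<int>(42);                 // needs <memory>
      std::pair<int, float> p{*answer, std::numeric_limits<float>::lowest()};  // needs <utility> and <limits>
      std::this_thread::yield();                                               // needs <thread>
      return p.first == 42 ? 0 : 1;
    }

With GCC 10 and earlier the same file may have compiled without the four
includes because other standard headers pulled them in indirectly.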
Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 3 ++- VERSION | 2 +- src/3rd_party/fbgemm | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c6249544..921924ccc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,10 +28,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Add unit tests for binary files. - Fix compilation with OMP - Compute aligned memory sizes using exact sizing -- Support for loading lexical shortlist from a binary blob +- Support for loading lexical shortlist from a binary blob - Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option ### Fixed +- Add GCC11 support into FBGEMM - Added pragma to ignore unused-private-field error on elementType_ on macOS - Do not set guided alignments for case augmented data if vocab is not factored - Various fixes to enable LSH in Quicksand diff --git a/VERSION b/VERSION index 3c40cf565..89f579488 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.24 +v1.10.25 diff --git a/src/3rd_party/fbgemm b/src/3rd_party/fbgemm index c258054a8..6f45243cb 160000 --- a/src/3rd_party/fbgemm +++ b/src/3rd_party/fbgemm @@ -1 +1 @@ -Subproject commit c258054a87b7c9020014558bd81819b3f7104cc0 +Subproject commit 6f45243cb8ab7d7ab921af18d313ae97144618b8 From 8b8d1b11e28a421b348703d702c9c5206061df9d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 25 Nov 2021 02:33:49 +0000 Subject: [PATCH 126/254] Merged PR 21553: Parallelize data reading for training This parallelizes data reading. On very fast GPUs and with small models training speed can be starved by too slow batch creation. Use --data-threads 8 or more, by default currently set to 1 for backcompat. --- src/common/config_parser.cpp | 7 ++ src/common/utils.cpp | 8 +- src/data/batch_generator.h | 35 +++++--- src/data/corpus.cpp | 152 +++++++++++++++++++---------------- src/data/corpus.h | 3 + src/data/corpus_base.cpp | 44 +++++----- src/data/corpus_base.h | 105 ++++++++++++++++++++++-- src/data/corpus_nbest.cpp | 7 +- src/data/corpus_sqlite.cpp | 6 +- src/data/text_input.cpp | 6 +- 10 files changed, 251 insertions(+), 122 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 59b328e92..3d79f8af2 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -883,6 +883,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { if(mode_ == cli::mode::training) { cli.add("--shuffle-in-ram", "Keep shuffled corpus in RAM, do not write to temp file"); + + cli.add("--data-threads", + "Number of concurrent threads to use during data reading and processing", 1); + // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope. cli.add("--all-caps-every", "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8"); @@ -901,6 +905,9 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { cli.add("--mini-batch-round-up", "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. 
Disable with --mini-batch-round-up=false", true); + } else { + cli.add("--data-threads", + "Number of concurrent threads to use during data reading and processing", 1); } // clang-format on } diff --git a/src/common/utils.cpp b/src/common/utils.cpp index 72624041f..99fc790a2 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -70,22 +70,20 @@ void split(const std::string& line, // the function guarantees that the output has as many elements as requested void splitTsv(const std::string& line, std::vector& fields, size_t numFields) { fields.clear(); + fields.resize(numFields); // make sure there is as many elements as requested size_t begin = 0; size_t pos = 0; for(size_t i = 0; i < numFields; ++i) { pos = line.find('\t', begin); if(pos == std::string::npos) { - fields.push_back(line.substr(begin)); + fields[i] = line.substr(begin); break; } - fields.push_back(line.substr(begin, pos - begin)); + fields[i] = line.substr(begin, pos - begin); begin = pos + 1; } - if(fields.size() < numFields) // make sure there is as many elements as requested - fields.resize(numFields); - ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line); } diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h index a248db23a..ea9774682 100644 --- a/src/data/batch_generator.h +++ b/src/data/batch_generator.h @@ -2,6 +2,7 @@ #include "common/options.h" #include "common/signal_handling.h" +#include "common/timer.h" #include "data/batch_stats.h" #include "data/rng_engine.h" #include "training/training_state.h" @@ -92,6 +93,8 @@ class BatchGenerator : public RNGEngine { // this runs on a bg thread; sequencing is handled by caller, but locking is done in here std::deque fetchBatches() { + timer::Timer total; + typedef typename Sample::value_type Item; auto itemCmp = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; // sort by element length, not content @@ -135,19 +138,29 @@ class BatchGenerator : public RNGEngine { if(current_ != data_->end()) ++current_; } - size_t sets = 0; - while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data + + Samples maxiBatchTemp; + while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data if (saveAndExitRequested()) // stop generating batches return std::deque(); - maxiBatch->push(*current_); - sets = current_->size(); + + maxiBatchTemp.push_back(*current_); + // do not consume more than required for the maxi batch as this causes // that line-by-line translation is delayed by one sentence - bool last = maxiBatch->size() == maxSize; + bool last = maxiBatchTemp.size() == maxSize; if(!last) ++current_; // this actually reads the next line and pre-processes it } - size_t numSentencesRead = maxiBatch->size(); + size_t numSentencesRead = maxiBatchTemp.size(); + + size_t sets = 0; + for(auto&& s : maxiBatchTemp) { + if(!s.empty()) { + sets = s.size(); + maxiBatch->push(s); + } + } // construct the actual batches and place them in the queue Samples batchVector; @@ -163,6 +176,7 @@ class BatchGenerator : public RNGEngine { BatchStats::const_iterator cachedStatsIter; if (stats_) cachedStatsIter = stats_->begin(); + while(!maxiBatch->empty()) { // while there are sentences in the queue if (saveAndExitRequested()) // stop generating batches return std::deque(); @@ -178,12 +192,7 @@ class BatchGenerator : public RNGEngine { lengths[i] = batchVector.back()[i].size(); // record max lengths so far maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter); - // this 
optimization makes no difference indeed -#if 0 // sanity check: would we find the same entry if searching from the start? - auto it = stats_->lower_bound(lengths); - auto maxBatchSize1 = stats_->findBatchSize(lengths, it); - ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked"); -#endif + makeBatch = batchVector.size() >= maxBatchSize; // if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch if(batchVector.size() > maxBatchSize) { @@ -231,6 +240,8 @@ class BatchGenerator : public RNGEngine { LOG(debug, "[data] fetched {} batches with {} sentences. Per batch: {} sentences, {} labels.", tempBatches.size(), numSentencesRead, (double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom); + LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed()); + return tempBatches; } diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index d8a364b2e..643a7de93 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -14,18 +14,30 @@ namespace data { Corpus::Corpus(Ptr options, bool translate /*= false*/, size_t seed /*= Config:seed*/) : CorpusBase(options, translate, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} Corpus::Corpus(std::vector paths, std::vector> vocabs, Ptr options, size_t seed /*= Config:seed*/) : CorpusBase(paths, vocabs, options, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { bool isFactoredVocab = vocabs_.back()->tryAs() != nullptr; @@ -52,16 +64,10 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { } SentenceTuple Corpus::next() { - // Used for handling TSV inputs - // Determine the total number of fields including alignments or weights - auto tsvNumAllFields = tsvNumInputFields_; - if(alignFileIdx_ > -1) - ++tsvNumAllFields; - if(weightFileIdx_ > -1) - ++tsvNumAllFields; - std::vector fields(tsvNumAllFields); - - for(;;) { // (this is a retry loop for skipping invalid sentences) + size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size(); + std::vector fields(numStreams); + + while(true) { // retry loop // get index of the current sentence size_t curId = pos_; // note: at end, pos_ == total size // if corpus has been shuffled, ids_ contains sentence indexes @@ -69,83 +75,91 @@ SentenceTuple Corpus::next() { curId = ids_[pos_]; pos_++; - // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); size_t eofsHit = 0; - size_t numStreams = corpusInRAM_.empty() ? 
files_.size() : corpusInRAM_.size(); - for(size_t i = 0; i < numStreams; ++i) { - std::string line; - + for(size_t i = 0; i < numStreams; ++i) { // looping of all streams // fetch line, from cached copy in RAM or actual file if (!corpusInRAM_.empty()) { if (curId < corpusInRAM_[i].size()) - line = corpusInRAM_[i][curId]; + fields[i] = corpusInRAM_[i][curId]; else { eofsHit++; continue; } } else { - bool gotLine = io::getline(*files_[i], line).good(); + bool gotLine = io::getline(*files_[i], fields[i]).good(); if(!gotLine) { eofsHit++; continue; } } + } - if(i > 0 && i == alignFileIdx_) { - addAlignmentToSentenceTuple(line, tup); - } else if(i > 0 && i == weightFileIdx_) { - addWeightsToSentenceTuple(line, tup); - } else { - if(tsv_) { // split TSV input and add each field into the sentence tuple - utils::splitTsv(line, fields, tsvNumAllFields); - size_t shift = 0; - for(size_t j = 0; j < tsvNumAllFields; ++j) { - // index j needs to be shifted to get the proper vocab index if guided-alignment or - // data-weighting are preceding source or target sequences in TSV input - if(j == alignFileIdx_ || j == weightFileIdx_) { - ++shift; - } else { - size_t vocabId = j - shift; - bool altered; - preprocessLine(fields[j], vocabId, /*out=*/altered); - if (altered) - tup.markAltered(); - addWordsToSentenceTuple(fields[j], vocabId, tup); - } - } - - // weights are added last to the sentence tuple, because this runs a validation that needs - // length of the target sequence - if(alignFileIdx_ > -1) - addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); - if(weightFileIdx_ > -1) - addWeightsToSentenceTuple(fields[weightFileIdx_], tup); + if(eofsHit == numStreams) + return SentenceTuple(); // unintialized SentenceTuple which will be invalid when tested + ABORT_IF(eofsHit != 0, "not all input files have the same number of lines"); + + auto makeSentenceTuple = [this](size_t curId, std::vector fields) { + if(tsv_) { + // with tsv inputs data, there is only one input stream, hence we only have one field + // which needs to be tokenized into tab-separated fields + ABORT_IF(fields.size() != 1, "Reading TSV file, but we have don't have exactly one stream??"); + size_t numAllFields = tsvNumInputFields_; + if(alignFileIdx_ > -1) + ++numAllFields; + if(weightFileIdx_ > -1) + ++numAllFields; + // replace single-element fields array with extracted tsv fields + std::vector tmpFields; + utils::splitTsv(fields[0], tmpFields, numAllFields); // this verifies the number of fields + fields.swap(tmpFields); + } + + // fill up the sentence tuple with sentences from all input files + SentenceTupleImpl tup(curId); + size_t shift = 0; + for(size_t i = 0; i < fields.size(); ++i) { + // index j needs to be shifted to get the proper vocab index if guided-alignment or + // data-weighting are preceding source or target sequences in TSV input + if(i == alignFileIdx_ || i == weightFileIdx_) { + ++shift; } else { + size_t vocabId = i - shift; bool altered; - preprocessLine(line, i, /*out=*/altered); + preprocessLine(fields[i], vocabId, /*out=*/altered); if (altered) tup.markAltered(); - addWordsToSentenceTuple(line, i, tup); + addWordsToSentenceTuple(fields[i], vocabId, tup); } + + // weights are added last to the sentence tuple, because this runs a validation that needs + // length of the target sequence + if(alignFileIdx_ > -1) + addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); + if(weightFileIdx_ > -1) + addWeightsToSentenceTuple(fields[weightFileIdx_], tup); } - } - - if (eofsHit == numStreams) - return SentenceTuple(0); 
- ABORT_IF(eofsHit != 0, "not all input files have the same number of lines"); - // check if all streams are valid, that is, non-empty and no longer than maximum allowed length - if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { - return words.size() > 0 && words.size() <= maxLength_; - })) - return tup; + // check if all streams are valid, that is, non-empty and no longer than maximum allowed length + if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { + return words.size() > 0 && words.size() <= maxLength_; + })) { + return tup; + } else { + return SentenceTupleImpl(); // return an empty tuple if above test does not pass + } + }; + + if(threadPool_) { // use thread pool if available + return SentenceTuple(threadPool_->enqueue(makeSentenceTuple, curId, fields)); + } else { // otherwise launch here and just pass the result into the wrapper + auto tup = makeSentenceTuple(curId, fields); + if(!tup.empty()) + return SentenceTuple(tup); + } - // otherwise skip this sentence and try the next one - // @TODO: tail recursion? - } + } // end of retry loop } // reset and initialize shuffled reading @@ -167,6 +181,8 @@ void Corpus::reset() { pos_ = 0; for (size_t i = 0; i < paths_.size(); ++i) { if(paths_[i] == "stdin" || paths_[i] == "-") { + std::cin.tie(0); + std::ios_base::sync_with_stdio(false); files_[i].reset(new std::istream(std::cin.rdbuf())); // Probably not necessary, unless there are some buffers // that we want flushed. diff --git a/src/data/corpus.h b/src/data/corpus.h index e8e9a9fdb..281d43a22 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -4,6 +4,7 @@ #include #include +#include "3rd_party/threadpool.h" #include "common/definitions.h" #include "common/file_stream.h" #include "common/options.h" @@ -20,6 +21,8 @@ class Corpus : public CorpusBase { private: std::vector> tempFiles_; std::vector ids_; + + UPtr threadPool_; // thread pool for parallelized data reading // for shuffle-in-ram bool shuffleInRAM_{false}; diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 5f9a9ee36..bfce31bf9 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -12,7 +12,24 @@ typedef std::vector MaskBatch; typedef std::pair WordMask; typedef std::vector SentBatch; -CorpusIterator::CorpusIterator() : pos_(-1), tup_(0) {} +void SentenceTupleImpl::setWeights(const std::vector& weights) { + if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine + ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); + auto numWeights = weights.size(); + auto numTrgWords = back().size(); + // word-level weights may or may not contain a weight for EOS tokens + if(numWeights != numTrgWords && numWeights != numTrgWords - 1) + LOG(warn, + "[warn] " + "Number of weights ({}) does not match the number of target words ({}) in line #{}", + numWeights, + numTrgWords, + id_); + } + weights_ = weights; +} + +CorpusIterator::CorpusIterator() : pos_(-1) {} CorpusIterator::CorpusIterator(CorpusBase* corpus) : corpus_(corpus), pos_(0), tup_(corpus_->next()) {} @@ -23,7 +40,7 @@ void CorpusIterator::increment() { } bool CorpusIterator::equal(CorpusIterator const& other) const { - return this->pos_ == other.pos_ || (this->tup_.empty() && other.tup_.empty()); + return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); } const SentenceTuple& CorpusIterator::dereference() const { @@ -390,7 +407,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) void 
CorpusBase::addWordsToSentenceTuple(const std::string& line, size_t batchIndex, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { // This turns a string in to a sequence of numerical word ids. Depending // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece // is used. @@ -411,7 +428,7 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, } void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used " "together at the moment"); @@ -420,7 +437,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, tup.setAlignment(align); } -void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const { +void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const { auto elements = utils::split(line, " "); if(!elements.empty()) { @@ -549,23 +566,6 @@ size_t CorpusBase::getNumberOfTSVInputFields(Ptr options) { return 0; } -void SentenceTuple::setWeights(const std::vector& weights) { - if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine - ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); - auto numWeights = weights.size(); - auto numTrgWords = back().size(); - // word-level weights may or may not contain a weight for EOS tokens - if(numWeights != numTrgWords && numWeights != numTrgWords - 1) - LOG(warn, - "[warn] " - "Number of weights ({}) does not match the number of target words ({}) in line #{}", - numWeights, - numTrgWords, - id_); - } - weights_ = weights; -} - // experimental: hide inline-fix source tokens from cross attention std::vector SubBatch::crossMaskWithInlineFixSourceSuppressed() const { diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 251df5bc6..82a012862 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -11,6 +11,8 @@ #include "data/rng_engine.h" #include "data/vocab.h" +#include + namespace marian { namespace data { @@ -22,7 +24,7 @@ namespace data { * construction of marian::data::CorpusBatch objects. They are not a part of * marian::data::CorpusBatch. */ -class SentenceTuple { +class SentenceTupleImpl { private: size_t id_; std::vector tuple_; // [stream index][step index] @@ -33,12 +35,17 @@ class SentenceTuple { public: typedef Words value_type; + /** + * @brief Creates an empty tuple with 0 id (default constructor). + */ + SentenceTupleImpl() : id_(0) {} + /** * @brief Creates an empty tuple with the given Id. */ - SentenceTuple(size_t id) : id_(id) {} + SentenceTupleImpl(size_t id) : id_(id) {} - ~SentenceTuple() { tuple_.clear(); } + ~SentenceTupleImpl() {} /** * @brief Returns the sentence's ID. @@ -114,6 +121,92 @@ class SentenceTuple { void setAlignment(const WordAlignment& alignment) { alignment_ = alignment; } }; +class SentenceTuple { +private: + std::shared_ptr> fImpl_; + mutable std::shared_ptr impl_; + +public: + typedef Words value_type; + + /** + * @brief Creates an empty tuple with no associated future. 
+ */ + SentenceTuple() {} + + SentenceTuple(const SentenceTupleImpl& tupImpl) + : impl_(std::make_shared(tupImpl)) {} + + SentenceTuple(std::future&& fImpl) + : fImpl_(new std::future(std::move(fImpl))) {} + + SentenceTupleImpl& get() const { + if(!impl_) { + ABORT_IF(!fImpl_ || !fImpl_->valid(), "No future tuple associated with SentenceTuple"); + impl_ = std::make_shared(fImpl_->get()); + } + return *impl_; + } + + /** + * @brief Returns the sentence's ID. + */ + size_t getId() const { return get().getId(); } + + /** + * @brief Returns whether this Tuple was altered or augmented from what + * was provided to Marian in input. + */ + bool isAltered() const { return get().isAltered(); } + + /** + * @brief The size of the tuple, e.g. two for parallel data with a source and + * target sentences. + */ + size_t size() const { return get().size(); } + + /** + * @brief confirms that the tuple has been populated with data + */ + bool valid() const { + return fImpl_ || impl_; + } + + /** + * @brief The i-th tuple sentence. + * + * @param i Tuple's index. + */ + Words& operator[](size_t i) { return get()[i]; } + const Words& operator[](size_t i) const { return get()[i]; } + + /** + * @brief The last tuple sentence, i.e. the target sentence. + */ + Words& back() { return get().back(); } + const Words& back() const { return get().back(); } + + /** + * @brief Checks whether the tuple is empty. + */ + bool empty() const { return get().empty(); } + + auto begin() const -> decltype(get().begin()) { return get().begin(); } + auto end() const -> decltype(get().end()) { return get().end(); } + + auto rbegin() const -> decltype(get().rbegin()) { return get().rbegin(); } + auto rend() const -> decltype(get().rend()) { return get().rend(); } + + /** + * @brief Get sentence weights. + * + * For sentence-level weights the vector contains only one element. + */ + const std::vector& getWeights() const { return get().getWeights(); } + + const WordAlignment& getAlignment() const { return get().getAlignment(); } +}; + /** * @brief Batch of sentences represented as word indices with masking. 
*/ @@ -586,17 +679,17 @@ class CorpusBase : public DatasetBase batch, const std::vector& batchVector); diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp index d5a48d8df..8029d3516 100644 --- a/src/data/corpus_nbest.cpp +++ b/src/data/corpus_nbest.cpp @@ -43,7 +43,7 @@ SentenceTuple CorpusNBest::next() { pos_++; // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); std::string line; lastLines_.resize(files_.size() - 1); @@ -74,9 +74,10 @@ SentenceTuple CorpusNBest::next() { if(cont && std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + + return SentenceTuple(); } void CorpusNBest::reset() { diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp index 297847c04..f7c577f29 100644 --- a/src/data/corpus_sqlite.cpp +++ b/src/data/corpus_sqlite.cpp @@ -109,7 +109,7 @@ SentenceTuple CorpusSQLite::next() { while(select_->executeStep()) { // fill up the sentence tuple with sentences from all input files size_t curId = select_->getColumn(0).getInt(); - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { auto line = select_->getColumn((int)(i + 1)); @@ -126,9 +126,9 @@ SentenceTuple CorpusSQLite::next() { if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + return SentenceTuple(); } void CorpusSQLite::shuffle() { diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 958190fce..b1f4cdd47 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -40,7 +40,7 @@ SentenceTuple TextInput::next() { size_t curId = pos_++; // fill up the sentence tuple with source and/or target sentences - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { std::string line; if(io::getline(*files_[i], line)) { @@ -57,9 +57,9 @@ SentenceTuple TextInput::next() { } if(tup.size() == files_.size()) // check if each input file provided an example - return tup; + return SentenceTuple(tup); else if(tup.size() == 0) // if no file provided examples we are done - return SentenceTuple(0); + return SentenceTuple(); else // neither all nor none => we have at least on missing entry ABORT("There are missing entries in the text tuples."); } From bbc673c50fbf2faa90bdc44003d15087632262bc Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 24 Nov 2021 18:42:14 -0800 Subject: [PATCH 127/254] update CHANGELOG and VERSION --- CHANGELOG.md | 6 +++++- VERSION | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05658fe10..bce24cfc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added -- Adds option --add-lsh to marian-conv which allows the LSH to be memory-mapped. +- Parallelized data reading with e.g. `--data-threads 8` +- Top-k sampling during decoding with e.g. `--output-sampling topk 10` +- Improved mixed precision training with `--fp16` +- Set FFN width in decoder independently from encoder with e.g. `--transformer-dim-ffn 4096 --transformer-decoder-dim-ffn 2048` +- Adds option `--add-lsh` to marian-conv which allows the LSH to be memory-mapped. 
- Early stopping based on first, all, or any validation metrics via `--early-stopping-on` - Compute 8.6 support if using CUDA>=11.1 - Support for RMSNorm as drop-in replace for LayerNorm from `Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization`. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. diff --git a/VERSION b/VERSION index 3c40cf565..cf4bd774c 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.24 +v1.10.42 From c64cb2990ec738478d9c1cc2932040415de5aa6f Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Mon, 6 Dec 2021 14:06:14 +0000 Subject: [PATCH 128/254] Constrain version of mistune to before v2 in GitHub CI Documentation builds (#894) --- doc/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/requirements.txt b/doc/requirements.txt index 8d56e6839..b4b2038aa 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -3,4 +3,5 @@ breathe==4.13.0 exhale sphinx_rtd_theme recommonmark +mistune<2.0.0 m2r From e8ea37cd5b85e3df817b9ced68bef9cc64b45d16 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 6 Dec 2021 23:20:44 +0000 Subject: [PATCH 129/254] Merged PR 21648: Allow for dynamic gradient scaling to fade out after N updates Allow for dynamic gradient scaling to fade out after N updates --- src/tensors/gpu/prod.cpp | 6 +++++- src/training/graph_group.cpp | 17 ++++++++++++++--- src/training/graph_group.h | 1 + 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index bf0d23957..c72af4db9 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -562,7 +562,11 @@ void ProdBatchedLegacy(marian::Tensor C, ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); #if COMPILE_FP16 } else if(C->type() == Type::float16) { // not a *.cu file - ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); + // we use computeType=float here for fp16 training as this seems more stable and roughly as fast + ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); + + // original for reference: + // ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); #endif } else { ABORT("ProdBatchedLegacy not implemented for element type {}", C->type()); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 03e5acf40..59cd4b6d8 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -31,11 +31,16 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) if(vgc.size() > 0) dynamicGradientScalingFactor_ = std::stof(vgc[0]); if(vgc.size() > 1) dynamicGradientScalingUseLogs_ = vgc[1] == "log"; + if(vgc.size() > 2) dynamicGradientScalingFadeout_ = std::stoul(vgc[2]); LOG_ONCE(info, "Re-scaling gradient to have average gradient norm if (log={}) gradient norm diverges from average by {} sigmas", dynamicGradientScalingUseLogs_, dynamicGradientScalingFactor_); + if(dynamicGradientScalingFadeout_ > 0) + LOG_ONCE(info, + "Dynamic gradient re-scaling will fade out linearly after {} updates", + dynamicGradientScalingFadeout_); } if(options_->get("check-gradient-nan")) { @@ -229,11 +234,17 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average. 
auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms. + float fadeoutMultiplier = 1.f; + if(dynamicGradientScalingFadeout_ > 0ul) // fade out linearly after that many updates @TODO: allow units other than updates + fadeoutMultiplier = (float)std::max(dynamicGradientScalingFadeout_, scheduler_->numberOfBatches()) / (float)dynamicGradientScalingFadeout_; + + float dynamicGradientScalingFactorWithFadeout = dynamicGradientScalingFactor_ * fadeoutMultiplier; // if fadeoutMultiplier increases dynamic gradient scaling becomes less and less likely to happen over time. // delta of (log) gradient norm vs (log) gradient norm average is larger than N standard deviations // hence rescale gradient using the average. - if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactor_ * gNormStdTransform) { - LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f}", - dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactor_, gNormStdTransform); + if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactorWithFadeout * gNormStdTransform) { + if(isMainProcess()) + LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}", + dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm); normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. } diff --git a/src/training/graph_group.h b/src/training/graph_group.h index b7f2f7efc..aa68922ab 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -74,6 +74,7 @@ class GraphGroup { bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; + size_t dynamicGradientScalingFadeout_{0ul}; // determines the number of input streams (i.e. input files or fields in the TSV input) that need // to be included in the batch, i.e. 
without alignments and weights From cd9afea8d34a8e1507b35ff4a7b7eade2aee4cc7 Mon Sep 17 00:00:00 2001 From: Qianqian Zhu Date: Tue, 7 Dec 2021 15:10:46 +0000 Subject: [PATCH 130/254] Documentation about how to write code documentation (#891) * add initial guidelines of code documentation * fix math formula not displayed in Sphinx * remove @name tags which cannot be extracted by exhale and cause function signature errors * fix markdown ref warning and update markdown parser in sphinx * more about doxygen: add Doxygen commands and math formulas * move code doc guide to a new .rst file * add formula image * Set myst-parser version appropriate for the requested sphinx version * Update documentation on how to write Doxygen comments * Add new section to the documentation index * Sphinx 2.4.4 requires myst-parser 0.14 * complete code doc guide and small fixes on reStructuredText formats * More about reStructuredText * Update badges on the documentation frontpage Co-authored-by: Roman Grundkiewicz --- doc/README.md | 22 +- doc/conf.py | 13 +- doc/doc_guide.rst | 336 +++++++++++++++++++++++++++++++ doc/graph.md | 10 +- doc/images/formula1.png | Bin 0 -> 1012 bytes doc/images/gelu_formula.png | Bin 0 -> 8825 bytes doc/index.rst | 6 +- doc/operators.md | 62 +++--- doc/requirements.txt | 3 +- src/graph/expression_operators.h | 52 ++--- 10 files changed, 424 insertions(+), 80 deletions(-) create mode 100644 doc/doc_guide.rst create mode 100644 doc/images/formula1.png create mode 100644 doc/images/gelu_formula.png diff --git a/doc/README.md b/doc/README.md index 87d86ba1c..efa0b43bd 100644 --- a/doc/README.md +++ b/doc/README.md @@ -23,7 +23,7 @@ Then set up a Python environment and install modules: pip3 install virtualenv virtualenv venv -p python3 source venv/bin/activate - pip install -r requirements.txt + pip3 install -r requirements.txt Documentation building should also work on Windows, but it has not been tested. @@ -48,4 +48,22 @@ Directories: ## Writing documentation -To be documented... +See [this section](src/doc_guide.rst) in the documentation for detailed recommendations on how to +write code and user documentation in Marian. + +In a nutshell, each class, struct or function should have a Doxygen comment following the basic +template of: + + /** + * Brief summary. + * Detailed description. More detail. + * @see Some reference + * @param Parameter description + * @return Return value description + */ + std::string function(int param); + +And attributes should be documented with an inline comment, for example: + + int var; ///< Brief description + diff --git a/doc/conf.py b/doc/conf.py index f05bf6487..b0c68bcdf 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -37,11 +37,11 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.imgmath', + 'sphinx.ext.mathjax', 'sphinx.ext.todo', 'breathe', 'exhale', - 'recommonmark', + 'myst_parser', ] # Add any paths that contain templates here, relative to this directory. @@ -57,6 +57,13 @@ 'README.md', ] +# The file extensions of source files. Sphinx considers the files with +# this suffix as sources. By default, Sphinx only supports 'restructuredtext' +# file type. You can add a new file type using source parser extensions. 
+source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} # -- Options for HTML output ------------------------------------------------- @@ -91,6 +98,7 @@ ENABLE_PREPROCESSING = YES JAVADOC_AUTOBRIEF = YES WARN_IF_UNDOCUMENTED = NO +USE_MATHJAX = YES """ exhale_args = { @@ -100,6 +108,7 @@ 'doxygenStripFromPath' : '..', 'createTreeView' : True, 'exhaleExecutesDoxygen' : True, + # 'verboseBuild' : True, # set True for debugging 'exhaleDoxygenStdin' : doxygen_config.strip(), } diff --git a/doc/doc_guide.rst b/doc/doc_guide.rst new file mode 100644 index 000000000..c4f6b6b73 --- /dev/null +++ b/doc/doc_guide.rst @@ -0,0 +1,336 @@ +Writing documentation +--------------------- + +Marian’s documentation is generated using `Sphinx`_ + `Breathe`_ + `Doxygen`_ + `Exhale`_. +`Doxygen`_ is used for documenting the source code and `Sphinx`_ (together with the extensions of +`Breathe`_ and `Exhale`_) for managing handwritten documentation and generating library API +reference. + +Whenever you add new code or propose changes to Marian, we would highly appreciate if you also add +new Doxygen comments or update existing ones as needed along with your changes (see the `Doxygen +guidelines`_ below). Your Doxygen comments will be integrated in the Marian’s documentation +automatically. + +There is an ongoing and incremental effort with the goal of documenting essential Marian API in a +consistent way. The existing code might not follow these guidelines, but new code should. + + +Code documentation with Doxygen +``````````````````````````````` + +`Doxygen`_ is a powerful documentation system for C++ and many other languages that parses and +extracts documentation comments included in the source code to generate a comprehensive +documentation, for example, in HTML or LaTeX format. + +Doxygen basics +************** + +Doxygen recognises several special comment blocks with some additional markings. In Marian, we +follow the **Javadoc style**, which consist of a C-style comment block starting with two ``*``'s, +like this: + +.. code:: cpp + + /** + * ... text ... + */ + +A documentation comment for all main entities in the code (e.g. classes, functions, methods, etc.) +always includes two sections: a *brief* summary and *detailed* description. In Marian, a Java-style +comment block automatically starts a brief description which ends at the first dot followed by a +space or new line (i.e. there is no need to add the `@brief` keyword). Here is an example: + +.. code:: cpp + + /** + * Brief description which ends at this dot. Details follow + * here. + */ + +If you want to put documentation after members (e.g., a variable and enum), you have to put an +additional ``<`` marker in the comment block. + +.. code:: cpp + + int var; ///< Brief description after the member + +Doxygen commands +**************** + +More details in the documentation can be provided using special Doxygen's special commands +(keywords) which start with an at-sign (@). See `Doxygen special commands`_ for the complete list +of available commands. 
Here, we list the most common Doxygen commands, which we use to document +Marian: + ++-----------------------+-----------------------+-----------------------+ +| Doxygen Command | Detailed Description | Example | ++=======================+=======================+=======================+ +| @param | Add a parameter | ``@param device a | +| | description for a | pointer to the | +| | function parameter | device`` | ++-----------------------+-----------------------+-----------------------+ +| @return | Add a return value | ``@return a pointer | +| | description for a | to the constant | +| | function | node`` | ++-----------------------+-----------------------+-----------------------+ +| @see | Add a cross-reference | ``@see reshape()`` | +| | to classes, | | +| | functions, methods, | | +| | variables, files or | | +| | URL | | ++-----------------------+-----------------------+-----------------------+ +| @ref | Create a reference to | ``@ref IndexType`` | +| | another item being | | +| | documented. | | ++-----------------------+-----------------------+-----------------------+ +| @copybrief | Copy the brief | ``@copybrief slice`` | +| | description from the | | +| | object specified | | ++-----------------------+-----------------------+-----------------------+ +| @copydetails | Copy the detailed | ``@copydetails dot`` | +| | documentation from | | +| | the object specified | | ++-----------------------+-----------------------+-----------------------+ +| @note | Add a note message | ``@note this is named | +| | where the text will | after an equivalent | +| | be highlighted | function in PyTorch`` | ++-----------------------+-----------------------+-----------------------+ +| @warning | Add a warning message | ``@warning | +| | where the text will | not implemented`` | +| | be highlighted | | ++-----------------------+-----------------------+-----------------------+ +| @b | Display a single word | ``@b bold`` | +| | using a bold font | | ++-----------------------+-----------------------+-----------------------+ +| @c | Display a single word | ``@c void`` | +| | using a typewriter | | +| | font | | ++-----------------------+-----------------------+-----------------------+ +| @p | Display a single word | ``@p transA`` | +| | using a typewriter | | +| | font. Equivalent to | | +| | ``@c`` | | ++-----------------------+-----------------------+-----------------------+ +| @em | Display a single word | ``@em x`` | +| | in italics. | | ++-----------------------+-----------------------+-----------------------+ + +.. warning:: + + Not all Doxygen special commands are supported in Exhale, e.g., `grouping`_ + [`1 `_]. + Some commands like `@name`_ could lead to errors when parsing overloaded functions. + To free yourself from debugging the Doxygen comments for hours, we recommend you only using the + above commands. + +Math formulas in Doxygen +************************ + +Doxygen supports LaTeX math formulas in the documentation. To include an inline formula that appears +in the running text, we need wrap it by a pair of ``@f$`` commands, for example: + +.. code:: none + + Default is no smoothing, @f$\alpha = 0 @f$. + +This will result in: Default is no smoothing, |formula1| + +.. |formula1| image:: images/formula1.png + +For the longer formulas which are in separate lines, we can put ``\f[`` and ``\f]`` commands between +the formulas, for instance: + +.. 
code:: none + + @f[ + \operatorname{gelu}(x) = x \cdot \Phi(x) + = x \cdot \frac{1}{2}\left[ + 1 + \operatorname{erf}\left(\frac{x}{\sqrt{2}}\right) + \right] + \sim \operatorname{swish}(x, 1.702) + @f] + +This will result in: + +.. figure:: images/gelu_formula.png + :alt: Example of formula 2 + + Example of formula 2 + +.. note:: + + Make sure the formula contains *valid* commands in `LaTeX’s math-mode`_. + +Recommendations +*************** + +First of all, add Doxygen comments in the header files. You can find the examples of Doxygen +comments in `src/graph/expression_graph.h`_. A good practice is to keep Doxygen comments as +intuitive and short as possible. Try not to introduce unnecessary vertical space (e.g., an empty +line). A basic template of Doxygen comments is shown as follows: + +.. code:: cpp + + /** + * Brief summary. + * Detailed description. More detail. + * @see Some reference + * @param Parameter description. + * @return Return value description. + */ + + +User documentation with Sphinx +`````````````````````````````` + +Sphinx supports `reStructuredText`_ and `Markdown`_ documents. Marian's user documentation files are +located in `doc`_. The default format of Sphinx is `reStructuredText`_ and most of the framework's +power comes from the richness of its default `reStructuredText`_ markup format. + + +reStructuredText +**************** + +As Marian’s documentation is generated using `Sphinx`_ + `Breathe`_ + `Doxygen`_ + `Exhale`_, +reStructuredText is the best language to use if you need to utilise many ``directives`` generated by +Sphinx / Breathe / Exhale and are not satisfied with Markdown features as mentioned :ref:`below +`. + +There are many useful ``directives`` supported by Sphinx / Breathe / Exhale which you could use in +your user documentation. Here we highlight the most useful directives when documenting Marian. +For the complete reStructuredText syntax guide, please refer to the `mini guide`_ provided by +`Exhale`_. Sphinx docs also covers the most important aspects of reStructuredText. Read more in the +`reStructuredText tutorials provided Sphinx`_. + +The first useful set of directives are `Breathe directives`_ which are used to include documentation +for different constructs. The available commands are listed below: + + .. code:: none + + .. doxygenindex:: + .. doxygenfunction:: + .. doxygenstruct:: + .. doxygenenum:: + .. doxygentypedef:: + .. doxygenclass:: + +The second one is `Exhale directives`_ which are used to link different constructs. The available +commands are listed below: + + .. code:: none + + :class:`namespace::ClassName` + :func:`namespace::ClassName::methodName` + :member:`namespace::ClassName::mMemberName` + :func:`namespace::funcName` + +.. tip:: + 1. reStructuredText is particularly sensitive to whitespace! If the rendered text does not turn + out as what you expected, double check space(s) or newline(s). + 2. It takes several minutes to build Marian's documentation (mostly due to Exhale). If you work + on a user documentation and need to check the rendered result frequently, you can comment out + the exhale extension in ``conf.py`` file once :doc:`Marian code documentation + ` is generated (i.e., building the whole documentation once). This will + greatly speed up the documentation building. + +.. _md-section: + +Markdown +******** + +Although reStructuredText is more powerful than Markdown, it might feel less intuitive if you have +never used it before. 
Sphinx docs now use `MyST-Parser`_ as a default extension for handling +Markdown, which adds more Markdown-friendly syntax for the purpose of the documentation, in addition +to the `CommonMark`_ features. Read more in the `MyST-Parser documentation`_. + +For instance, MyST-Parser supports `directives syntax`_, a generic block of explicit markup syntax +available in reStructuredText, such as ``note admonitions``: + + .. code:: none + + ```{note} Notes require **no** arguments, so content can start here. + ``` + +The above markdown text will be rendered as below: + + .. note:: + + Notes require **no** arguments, so content can start here. + +Another useful feature is that you can include reStructuredText text/files into a Markdown file. +This means you can take advantage of ``directives`` generated by Sphinx / Breathe / Exhale with +ease, especially if you want to highlight/reference the functions or classes in :doc:`Marian code +documentation `. +In general Sphinx docs only supports reStructuredText commands (such as `sphinx.ext.autodoc`_ and +`Breathe directives`_) to interact with the code documentation [`2 `_]. + +For example, let's assume that you want to include the function documentation of +``marian::inits::fromValue ( float )`` in the user documentation. You can use the following `Breathe +doxygenfunction directive`_ for this: + +.. doxygenfunction:: marian::inits::fromValue(float) + +To display the exactly same content as above, MyST-Parser offers the special `eval-rst directive`_ +to wrap reStructuredText directives: + + .. code:: none + + ```{eval-rst} + .. doxygenfunction:: marian::inits::fromValue(float) + ``` + +Also, you can link functions or classes in :doc:`Marian code documentation ` with +`eval-rst directive`_. For example, to link ``marian::inits::fromValue(float)`` you can use the +following markdown syntax: + + .. code:: none + + ```{eval-rst} + Link to :func:`marian::inits::fromValue` + ``` + +Or you can directly link to the function in `markdown hyperlink syntax`_: + + .. code:: none + + Link to [`marian::inits::fromValue(float)`](api/function_namespacemarian_1_1inits_1a71bb6dee3704c85c5f63a97eead43a1e.html#_CPPv4N6marian5inits9fromValueEf) + +Both outputs will be rendered with a clickable hyperlink to ``marian::inits::fromValue(float)`` in +the corresponding Library API page (as shown below): + + Link to :func:`marian::inits::fromValue` + +.. note:: + + The reference links for ``marian::inits::fromValue(float)`` is generated by `Exhale`_. For more + information about how to cross-reference the code documentation, see `Exhale's linking + strategy`_. + + +.. _Sphinx: https://www.sphinx-doc.org/en/master/usage/quickstart.html +.. _Breathe: https://breathe.readthedocs.io/en/latest/directives.html +.. _Doxygen: http://www.doxygen.nl/manual/docblocks.html +.. _Exhale: https://exhale.readthedocs.io/en/latest/usage.html +.. _Doxygen guidelines: #documentation-with-doxygen +.. _JAVADOC_AUTOBRIEF: https://www.doxygen.nl/manual/config.html#cfg_javadoc_autobrief +.. _Doxygen special commands: https://www.doxygen.nl/manual/commands.html +.. _grouping: https://www.doxygen.nl/manual/grouping.html +.. _@name: https://www.doxygen.nl/manual/commands.html#cmdname +.. _LaTeX’s math-mode: https://en.wikibooks.org/wiki/LaTeX/Mathematics +.. _src/graph/expression_graph.h: https://github.com/marian-nmt/marian-dev/blob/master/src/graph/expression_graph.h +.. _Markdown: https://www.sphinx-doc.org/en/master/usage/markdown.html +.. 
_reStructuredText: https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html +.. _doc: https://github.com/marian-nmt/marian-dev/tree/master/doc +.. _MyST-Parser: https://www.sphinx-doc.org/en/master/usage/markdown.html +.. _MyST-Parser documentation: https://myst-parser.readthedocs.io/en/latest/syntax/optional.html +.. _CommonMark: https://commonmark.org +.. _directives syntax: https://myst-parser.readthedocs.io/en/latest/syntax/syntax.html#directives-a-block-level-extension-point +.. _Breathe directives: https://breathe.readthedocs.io/en/latest/directives.html +.. _Breathe doxygenfunction directive: https://breathe.readthedocs.io/en/latest/directives.html#doxygenfunction +.. _sphinx.ext.autodoc: https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#module-sphinx.ext.autodoc +.. _eval-rst directive: https://myst-parser.readthedocs.io/en/latest/syntax/syntax.html#syntax-directives-parsing +.. _Exhale's linking strategy: https://exhale.readthedocs.io/en/latest/usage.html#linking-to-a-generated-file +.. _mini guide: https://exhale.readthedocs.io/en/latest/mastering_doxygen.html#features-available-by-using-sphinx-breathe-exhale-by-way-of-restructuredtext +.. _reStructuredText tutorials provided Sphinx: https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html +.. _markdown hyperlink syntax: https://www.markdownguide.org/basic-syntax/#links +.. _Exhale directives: https://exhale.readthedocs.io/en/latest/usage.html#suggested-restructuredtext-linking-strategy diff --git a/doc/graph.md b/doc/graph.md index 3ddeba909..f939c1759 100644 --- a/doc/graph.md +++ b/doc/graph.md @@ -6,7 +6,7 @@ The dynamic declaration, which means a new graph is created for each training in It allows handling of variably sized inputs, as well as the cases where the graph may change depending on the results of previous steps. Compared to static declaration, a dynamic computation graph could be expensive in terms of creating and optimising computation graphs. Marian uses careful memory management to remove overhead in computation graph construction, and supports efficient execution on both CPU and GPU. -The main implementation of computation graph is in under [`src/graph`](api/dir_src_graph.html#dir-src-graph) directory. +The main implementation of computation graph is in under [src/graph](dir_src_graph) directory. Building blocks for graphs: @@ -59,7 +59,7 @@ The _workspace memory_ means the size of the memory available for the forward an This does not include model size and optimizer parameters that are allocated outsize workspace. Hence you cannot allocate all device memory to the workspace. -To create a graph, Marian offer a set of shortcut functions that implements the common expression operators for a neural network (see [`src/graph/expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html)), such as `affine()`. +To create a graph, Marian offers a set of shortcut functions that implements the common expression operators for a neural network (see [src/graph/expression_operators.h](file_src_graph_expression_operators.h), such as `affine()`. These functions actually construct the corresponding operation nodes in the graph, make links with other nodes. E.g., `affine()` construct a `AffineNodeOp` node in the graph. Thus, building a graph turns into a simple task of defining expressions by using those functions. 
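Pulling these pieces together, a minimal end-to-end sketch of graph construction could look as
follows; the dimensions, parameter names and the `inputData` vector are chosen purely for
illustration and are not taken from the Marian sources:

```cpp
// Sketch only: build a tiny feed-forward expression and run the forward pass.
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});      // or a GPU device id
graph->reserveWorkspaceMB(128);

std::vector<float> inputData(64 * 256, 1.f); // dummy input values
auto x = graph->constant({64, 256}, inits::fromVector(inputData));
auto W = graph->param("W", {256, 512}, inits::glorotUniform());
auto b = graph->param("b", {1, 512}, inits::zeros());

auto h = tanh(affine(x, W, b));              // adds AffineNodeOp and TanhNodeOp nodes
graph->forward();                            // allocates tensors and computes values
```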
@@ -142,7 +142,7 @@ auto x = graph->constant({N, NUM_FEATURES}, inits::fromVector(inputData)); For the above example, the shape of the constant node is `{N, NUM_FEATURES}`, and the value of the constant node is initialised from a vector `inputData`. `inits::fromVector()` returns a `NodeInitializer` which is a functor used to initialise a tensor by copying from the given vector. -More functions used to initialise a node can be found in [`src/graph/node_initializers.h`](api/namespace_marian__inits.html#namespace-marian-inits) file. +More functions used to initialise a node can be found in [src/graph/node_initializers.h](namespace_marian__inits) file. Marian also provides some shortcut functions to construct special constant nodes, such as `ones()` and `zeros()`: ```cpp @@ -186,7 +186,7 @@ auto h = tanh(affine(x, W1, b1)); ``` In the above example, `affine()` and `tanh()` actually add `AffineNodeOp` and `TanhNodeOp` nodes to the graph. -For more shortcut functions used to add operations in the graph, you can find in [`src/graph/expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html) file. +For more shortcut functions used to add operations in the graph, you can find in [src/graph/expression_operators.h](file_src_graph_expression_operators.h) file. ## Graph execution @@ -279,7 +279,7 @@ This comes to how we define the loss function and optimiser for the graph. A loss function is used to calculate the model error between the predicted value and the actual value. The goal is to minimise this error during training. In a graph, the loss function is also represented as a group of node(s). -You can also use the operators provided in [`expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html) file to define the loss function. +You can also use the operators provided in [src/graph/expression_operators.h](file_src_graph_expression_operators.h) file to define the loss function. E.g., Marian offers `cross_entropy()` function to compute the cross-entropy loss between true labels and predicted labels. **Define a loss function for modified Example 1** diff --git a/doc/images/formula1.png b/doc/images/formula1.png new file mode 100644 index 0000000000000000000000000000000000000000..60eee254c866935f7e4790a994e2a19bbd781755 GIT binary patch literal 1012 zcmVZgXgFbngSdJ^%m!Ds)9ybVG7w zVRUJ4ZXi@?ZDjycb#5RqH6TP~V<0jxGBF@AHaamiIx;aJM@>|DrUQ}y00U4-L_t(Y ziS3qcNR?+8$A92Ms}bQO)w4#60U)}jv^%UCh`&|-{Y zW0kc*V__6~aRmytrOd{%7l%zGf|gX+2P@y2<<4e#jz^#KJoi56*70y$4i06p|EGuR zzMlKK{`d7N!r`#wF@W;09f%hm8zS1nc3@fcb$NN`WA7BF{}mcq%nY`$Bp@_2ExK#TA85~m_tZ$dD*ZtUNr(*{z~5P zHnZg*(oFlLGt|P@x5F` z+SM99{Hzt3HII@t8}Sv~Cp7WIXb&x4AK<&oV8Bi*@xl(a~@)Y2&}&CXU=t=YtFgvJ5EbO2^WhJ3k3xQSNWa14hqWi;-|JU2HMjZbD6vJ z>GHx`R#^|@=>THb#y;Is_$V0p=(^kc_*;3|p*Xm?yV`Mj+j!a8xp_Of`y8S6NTQ(7 zqbSSE=mq2+EJ`_2&3Esd96`j@*WcuqH&9lmP_}~rPA%COc%-HrheP&sF2MXg2W((# zuh4U%bpj$HDfC+rjC5zMe3_h&ZGCHKRD!&w#CWVpzgZ{v4H!w>7ZT0ih7Pq+xC+;O zu^9_nmx{L&>%lLHWlmD0oXB3j`{Q17fgg5v0$qM3i@^{@P=x)^FfT?QMeuYNYD$=h z_4G_Owl>95@84TlpQIPX|Lf<>L{9Qw%Z#zF;$QEm9ZF2a=zlHw*>^l_|IN^$7@_@d z7{-*q|MgT`J-)%{PX@yF`*FfTZK!Wejq;91`9X zJW%6$y6>B@x_NzwR)nJRu_53KSzV?Fc#^1n9Ix_~{i7UI;;PPM@`U4F!e0l&)1)NF z9K%zo9lnKEn#2r%lCjPi0^5sL&jL?^@E;ypmlo`_C}?$nkTWy2kl*NqzX@(dEN}10V;Rf>KVC0(+t6 zSbd5}e2Ts6J)8GTFfPO30K=2(*YJKTf2&D(pKGUQ*a_i7X1AYYQx}WVocIPC{w&}j z2vZ3VQtx0|tsszMtE3>2jg1+d+wlDfXe3-OE37bKPaVmCFV3Ss@@Re{X=!>n4hc!` zD<6jfXZHjR(a7^KctImtcoRgK!T1cJ!m0lzKp!d0ov^`E<0)LO4_;zx%=RYFwqR(! 
zWPccC)8%`47x!Y7HevGG)kz56MGCUiV^5oyY16xYGF9fkIHG;sBz;H}#qxT=Wjr;Z zLBF4lu2A1=ZqPS}^#z!34kVS>rWM2f|K51qaoGXPb|C7`nf3XA3Q)60CLqTi8?T zop?~72591x_CAFHO|)KQVo%~5x^ESlzzrsvn7C3pdw>=-TPfDW z62Yo8ON)cf*K1H%KD(uMvGu-;G)ur2Hs8??znRO4+@q>uD31AARsX+%gA(+CQ>xz8 zdWnF1%UH{&v4+>S`-MReOBdaiURd$>j)Ma%P{ioF5;?QW_g|DG9m&yjMA!zC%zie@BPr*K!lhFI(HYeQx&wX}y-&{=KG|~kNqjJ>M^u-{)ZI())wIhP{-=eFUm)B&s$}KoH82M*VtZ>F0)M4L24*-u#I!$ z>}tg^3~~7=jqR)|2hWBumi7(@@}B$>tQa~w)R1TJQ$t%zpG04?UXV)C!M0&c!#=#G zn}x%o{dQW_M|OLtnt#qK^;aaAd;$7ymxgiq*3}dpJ>$071(KjYk}_h5XmZ8SI0jzs z0GsxFDbERTe0FiBF<1;STJP=XNp^p=XztaGWsIh;VlVeW@|6}*}rMj z=y;l%GLN3#4sN1VFP0?r@RC8bwYX=@DN3N;6-}BHxK6C`UOy+&6ZA(@McVk5mDW$> zBjD1vIl}wIymu{TL*HG9IX_C?VV}TzSNR>f;jAH(<|zke07;KJ%8%&Eyv6w8gs-m; zC6s9{-2=)^#vt~cQW|U7m3W-FmU<(n9hQqON{9S(4H?L;u*dLbOu7mCwD7@T$*!=; zPH$9gk0hMBBya#PX`A_vQ7UE;+Z1!o5z!@~4LCsY?byPGaCOFQ2CWy{2m4H%a~5uj z!`6z>g`(e=k?@-AkI9yg_g%d~A@}Evnv1|;*I{GR#B<@WhcjVruC#kY+41gRP2Qcx zBl8C#AWZOq*A!Z0@a9WCg#(+j>7hc)klO&g)EU{sCio#P>;?wD(w*S9q9i%it~yN2FU3X1%2BUJ2AQuCz#ow}rhWY^NH-&rd-QCAd%WZe=adjN zWNKsOy)6zTpRtFrX8S@o-LO2MCA-*&%ZA_WRN_6c&}b{r#!`1QtNy~uS~A0fcgSm3 zMEhXJ)7u|k91?KkY|zk3hg-kfxww>OQub^9sI*mSpxUhEWw&+ba*E)CwZX|t81{$L zQeG09qtlrs1Dfjctk52?B~NVa-cB-Y;cf8AIWA%Hb+5p%T~7fn9^ED0n{ca7sDNnG zb`r-9lGAH$J*m*L!{?4WCoF?ER{mnDg^=kHG8?!B!)-wt3o_CNmLO;Ffr^o5a9=$b z`CJYskYyxmEsEA!Do3~26kc>(+SfhfG+1^0(!M2lCyd&LpRzx=v2+<>KZcXi_E#ET z!GU%El*7Wx`s9w|n{cv9fFERL)6}@=?ixVGkketcMu)xZ5nWvE;gJhashh(Be19ej zU0Nu!dRFB(se(V}NC}aKSlJA4hPEzlQyH%h_7eNB5 zwT;kF*yP~ZU|a%nMp!I|gVPo<<6vz5xq!IrYRn{UWrvr&kdg{}^6PlaJFZD0SJm)8 zl7{t>#SXKMB4{}vb+#mlO9@e;5h%Q!gE<&nCO8g5tY5T-f5B5_u5$bOs|Jml%l?gO ztxyW=V_ou+$wJ;|9f9w(y-WA25mS5Z z`y#U=g575e-5AAv>)MU&pslnA_A&H>c-O;5^a>U(CtCezdm30oJ@?@L?IY8n6LrRt zDEwBvfbJ-cNDlMU@x1Y88skP8+(}=0fiA^#b)x?~KKKt4u@d zE0lRSQ|Ac?jTY|yWFmGPyjP?zU;Si=_-}kdk)wRpatb*Lqr}ezzeoTrt;Azq#&DnV z!JoHkFMEy>P3)_Ooqk&RghqdCl(PqZs%+Ho;*<#s`Xu!9qi$FqI_oZ=i$~Kh0`?T* zv9yze`0XS{M!$a2&0XIHDFYiwGNGg6<0SBJY6S7-eByrWwxmm^LsNm-u`9YOM&{}? 
zmix2hbdLg>HyPlS8+adXn|iZT;q~pLuz`S}A$CPvph1_G@ePP9jk*8KS?w#f_${aA z2mXE|6U*vz^pkcIazKj!}`v{B^OXog%oyPf^~%ZQD1gK*vT5$U2L()-Pnb_79}WgQr_<~O5Mjt zd}vL5_CEcpLV?S9rS0Pyb4%M#OS|~L69(8tyUGs^-}@g&$>FuC^ben>W$IF!nv?++ zeKHE#rl#fLR6!%7!zV34Vdd|Fs>cuAXkZtF%Hk|jq1%hAg59v`@Zl3@1=mO+u$;Iu z%?w=ZGcIfS+R>jd8p`1Bjrs9E8dT7$o4-Urr3P0<1a%KsR+`^dk>ZFiShf1+y(l)@ z)-ukss-(1XmC%s(tl&^TU4!xLP5QR9yr)p>Jddb6r!97VV;_PAXEs-pv5?@x>(OJVW@v<^0Z-VUZ^13mL8XppgxmCQaXEw(yn8(_*`iH8+EzKSV zU1y+~9kybB_9;TBO@5Y*JXl(D`T8PY$pA9%c8{&^fCI*|)E1Hn2HpsZY;q=QMJUug zZ)>Yn)y3wcHgF_XUw9zD8Oi!U2{HPLn-v-91%M7WrdR*&+JK&&h^XcAS z_%U3Y%0+Ipi!wE3@zs&aYVjv08SAx;{A8l0OMJB|Fs_TySF3(c&j2*%6aH+?`G{iS z>snn;-=J6OSA6S2^|Hs*QkHf_Swm?HhoO9?`N!00Ba~>?qdO|K`-|3=yoxX8SFuvf zp&oK`H;^uob}@~(eJH`$S%PDmo&hwlC7by!^s+*zY{2ynY1eRVpR*L_4_%s-Xw%CT z8hzNm(aTcdijnzZk^kf86*XlDrBNqtK(BZ2f-hmBUyidDhX6gbxFd$%T}Sd(c{0#v zIYsSvc0c18g;JQtxW5OblT}e6KeoyIw6*7Fx69Aq&sB4g+k+--61#uX5QqVpQl#Fu zvR_iU8}-P+D(&USkB3Uy{Hb&gd;60!#YNWZ!lvYjc!?aTyLw6KMQO3Y_`I@Q7f!fA zj5t@ld#WeF`$c4}WzM4q(ICw{KKCMS`Fj6^ALghqwEJ(9cFi565)VpO_^l1e>#~y+ z3P2T)h>{y$Ie;Q04q8VfZD)Dn|Jor`gG4|2Q^-kH^{%IZ4jFKReJYk|EvC6%W3hJ- zx?X_QEqwIA9@^(@=GVyA#r6iLNA0aw0%Ae6eW=0GV3jk+mzz;j4qZuuf#-_cLK14v z98PPX6~wP6O+zh?Sgo4dj2Mf|d6B&R>_u@~jSZ|1-`chNwT;Bq-y06|a+QaP z=X^6vl?a$`Pg#3{m)ont2{#+MCcTpVWI`;*iW?r!ZAbr*6q$$~$fhpBc6=LkcBoLf zMz1eUQL6#|;RS{I_la6h-r@rz{!0$ZdFJy~0#x&iUaefKTY1!(^SAu3B=6KOE}V}& z$zUIx%S9J|So{(A$vr!qB6QEBIW6!8ol3l28M!jmR9(PZGDbPTK9-mlcvf3<=^?DX zd@tdCD1@olsV?FeFh6I8+~cRQn8bn=bf8bS8U;<|PfcI{H6bzqU#u91Ik#vW>t2+8 z{A0jzsE6xx&PRBs$ei^Xbpcpa$*v7?MK%Kzed#o{dKTFD4{0xgA|;@MULu`e%y3hH z*Vw_>BBvn28<&kM!@v9WCoLamkVGTfj7mZnwe!2RV(STp{6|v?ziM?{r7iidCeTjk zlY>B-e;C*fY|wTyM4yTx)GU>(Hh#4~PBph*>-3{)4F>`tJL+(+=zsd_C{e97x zKMunWUiBidk-S(xf*g>tYjcL+E6cKU?dGKSWpkl|;3Vg@)}Y9p z$c(D?G#uCU$Y|2_{USISgike`ehQ=pj?HV@jlZQkgmRxtU|CZrjA}$x==SX-@px%F zB#~NK2jrC27~aj32Hf05y(VG;XmkyGceQSf4=n!`d`PWbQW4P;9NQ0MIR?WGn>A6t z!Ki%Aq!lDU7tQhCYAK_(bjdXAE(C^278&~-XQQ8rY@JaO)@;#Nr9UrO(z>BdIAM@X zn%pn(l6`CYLQbbYMZgR065I4?r!{|fcmLI&8287bdzNBYb0gIao%pN{3(?DjohrU$ z&*hZD^PK0~vipkhjC|M>OF8k}pFLTsbsSFz9Dm_Vf-^7u;bs^?*(p+WP{2O6C? zj&7w^ncE-f0<9#bEuG`cj+QRb)0MrpG2^*?E2HFD>X&TjjKSz>35mdBrQ?N#Ejzy} ztTPS2b+MkW!9GOuBL&k{asjb6Hv8!)jALWR*5QB2NSoqVLx)Pze_ASJa_?tfesVAl zlSr>6T2&aizC2q9Od(tMV z-A8RJf1%5v8tnpt=I(_69VUDyhXb!n9g91?qr|<4qWmcvR<|ec%fEN2RyKft(9JDF zpFcuw?3=>;?pI(kCUSdZ*7@uA*+V#aso4!9R7t6&RRDcrvH|Ih?D=Ee{8H&X9i<&* zdHi=ZEIvM438HUxFDl8OKnu*!gIdmV<#|j}O^wy>?l#4t-kV8$d>UlJV(UKj+!_2Z zRg&`||G@(+v2#L+{a{{%>7u>mThE6o#%(lRmWpSuPmEuGY!(@^So(vh(Phi*wG?xp z04rFfc9%9;oP~+AU-I3$kv5x_B8utEUhKcFHcmpM5&i8H{5%GdaS|&}wR;nGovnTI zDL^tTZby6ZMKHF>5Rql!ztg9Lk`7rY=hR$HAYRo1j? zI4*(`j^~?k1QjUS2PJ4ZWoq$Dvx+KqW!gD)L|6@7cU)zV(%RCWWgoN}e}8xj=(>lC zmPl>EWf=)?f3SQP1dTX$nI;loM#UyRefPwM0f_Qu294OaBdO=RH=76zm7xTqh^0a0 zA?heapB0a61(I>5<7G3BRgt zTa-@2C-z^CwFU>^B3aJ0q6+4)R$Rtp&O#;BQ4;|%tdpC8kisVMKa?`=Sxqj}F-D2h zZoA(lxy{*xZ_%>=vnBP$eWl8<55eJE#TE?>8hG&y)j-}$t(dO=0XbnFDfR5OSZs`E zjQz@5kcVYM}8T-oB|JSW>gy6n*cC&=2g&LSc3q%7sG%)pRN> zhB}p+s}xEC;Wn6uU!zOgTYGo^sDY$*t~P#rytRWU*Y`{^UBc>sw6c9L^L}YY-$?rL z&rkUNZ(sy~lo#aIaorttO~|lRf*m z(&oPvw$@dY-JCQF#k`M|Ssd8o?u-ikV8=V4op)aCIvV+g(BW?XGt#{Cau#Pzc}eczS8NQSIKDmi>x-&zTX5 z{zgE}MMHCW`euaz0Prb0HPKUaz^q(3+(1-aQy7bNd=SFM=E!h)Zjjw>{gjdp`OJwew8qS2Ov$=AD_?vk`&B|^fZcV5(HuIl#2J}3mFbnl1+cMDKqjJM3jG>~d?v1xgiD%v- zeRC=}b%gTxvfWOyZ8i9E*l)$pV>zAEd>95{7j}}6k+*z?B}ylR>bVl|a&rOgh~ie! 
z=X^Z;qne#0qagkzIHmK`&RrVknO&EefquO==-DX+5pR*g=X~p3(UYN(DfzbI(QNei?M0qA6w9?18obRi-B!9i@Cv0>bk z2_j?)&&V=Xv*L)nADyC#ML}kCk?0@P1c(`@n-WMR%&FYmex7hx3|$7$iw)TdjLb#J z9)0|*V`}ZDE7LcmA8PF>7_pmaoHwe~Vk(|)oR`Z)t@*1b&%WAz{Y2U=D0gz8X6nYL zB&oGBl*5W#{aX#g>o{1uacizUn{J<{3w1L3sA^)ebHyCJ=IoF62-BJ5o*uFr$TqEw zzSyn7AXTt(JdQh9y$JzBb;POR;5%)u@lmnmMY~n!>CQD8s^*qo9s$hM>bBye*~Hgg z$4EU~tsgBe#nENA_f4NW$RQUjlpfhvEBqejee>YaY0M zTjv}ze)Zh%RN|~vjQw`e&Xrf<0#xW<*-}bXH&EKhukk)O(77kU&9jELT`gZ_vEPdX z*oRSkg4y5g{#f=zjI4vvuD{&mCvcfU?9Xrc5MS@udqm_)B!Q14cHjE3m+ruipSK?$ zdJ@1p32&+J%?^#=3ic0o*?CHn*Y$)Vj{?m&X_M5cboNJAqat`MHPqEvelAoBoj-cN zQ|-Q^I-0IXZylz3rL#;q-~kcpHV9r-wo{~*(P7cYv>kyO(f=2X^5 zE>lcFp;nu%2xdWcbH!OYKl*qx>kHrLVHmMzU4(O{QOIR{^j|j3KRUU(g21sZaa!#| z_)9gpy|a&YR$vb8Lw^658cp*Ds8_^it}fsD)6`uFhx%2BcZd1=1D$8SN^=1M(!CO%{pQENmMbl&N85^1et8ledgc$zn0EmU~iI$u)^ zO`-TgRS)!|Be}R6vnurcM8-~iUW&X=`D~x z<^K~@Hi03TmKU6SFuw7LiMK+0Lu$2snhAU3G44pFss1K$T%c{UrTO{{SMxrVp>-Uk8MB@&`|1y%3-0NY$v)HRJ3;| z803ggR9c^_AM^T&*}X)jP^z3REDrK8c$Jd=>aTeDl-+LLY(krXkIp zRd_YQ$R(W2T{Py3^>seq^+D+jJTH|%C#j)0x;s)6kPG^3J!UWj9kfrFVF(`C8kOIk zH0n54 ztcEP59_Se#0*qJ`Xuz=Gz11725VE585uPl)qOCgc=i8mVqU41Ccg(0M@;cR)FT_G{&r}mS@+rPO#7hKs9kFFp0TFU zbjt$fYk@`a;J)gadJKj`k$-Hi|JN~%EfSHn5>gu}d58-O>ka*CcL*EvdORP#@V<-v zn|!{Op_+|rYVysT$mF_=k(M`MRjZhIrzmJn*bKGOPlVWDo?VtFKPrPt1hN zH5%;q9V;3+?P-(68=ZC+Lx@RnJ^#e`yg0}d1mkXEbrZ$G3v;%e65vA^#3hSi6|Po{ zU$YM4|09zJ7i~%3xd#TGdhI(7*6D9HU4(3Vg|Pysv}v^g799)nMg~LD6d}r|DKYn6 z`DxtVN#bB&u{>8IR~>ky*415i=5^x2sgIv*>|xqJN)Rbb@*A6?N+W6EH?=n(-?*A4 zpFrBc<>kn=q0BR_c#K17taOrK^pw$18XJXp*tt{zL5 zGS0^q)#m*?4td8pbpQ!KnI?!s@aR^5P$?X8_h|^*WE8w1x_%06GWFMtY@1ypnsI~I z##4157X_A9e)I`|`V;G>zVL2$O|3M zC3r@$bYG{R44?(f{%HI*5Y__wi@9?2Y6v=N=YK+or>*(_NYL>Aqm8(J?0+4;wF+@Hd(w`eC@W~l*UDN({2$o}Tp<7e literal 0 HcmV?d00001 diff --git a/doc/index.rst b/doc/index.rst index ec3802b49..d0a4fefb4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -19,6 +19,8 @@ This is developer documentation. User documentation is available at https://mari contributing + doc_guide + Indices and tables ------------------ @@ -26,8 +28,8 @@ Indices and tables * :ref:`genindex` -.. |buildgpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.1.svg?label=CUDAC%20Build - :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev/ +.. |buildgpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.2.svg?label=CUDAC%20Build + :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.2/ :alt: GPU build status .. |buildcpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU%20Build diff --git a/doc/operators.md b/doc/operators.md index e08d20ca4..2cca391b7 100644 --- a/doc/operators.md +++ b/doc/operators.md @@ -25,7 +25,7 @@ Marian. The central component in the graph is the `Chainable` object. This object provides the abstract interface necessary to interact with elements in the computation graph. The details of this interface can be found in -[/src/graph/chainable.h](api/file_src_graph_chainable.h.html). Note that the +[/src/graph/chainable.h](file_src_graph_chainable.h). Note that the template parameter corresponds to the underlying data structure, which in Marian is the `Tensor`. 
Therefore, for convenience, the type `Expr` is defined: @@ -37,22 +37,22 @@ The implementation of the different operator components are divided across several files: - Expression Operator - - [/src/graph/expression_operators.h](api/file_src_graph_expression_operators.h.html) - - [/src/graph/expression_operators.cpp](api/file_src_graph_expression_operators.cpp.html) + - [/src/graph/expression_operators.h](file_src_graph_expression_operators.h) + - [/src/graph/expression_operators.cpp](file_src_graph_expression_operators.cpp) - Node Operator - - [/src/graph/node_operators_unary.h](api/file_src_graph_node_operators_unary.h.html) - - [/src/graph/node_operators_binary.h](api/file_src_graph_node_operators_binary.h.html) - - [/src/graph/node_operators_tuple.h](api/file_src_graph_node_operators_tuple.h.html) + - [/src/graph/node_operators_unary.h](file_src_graph_node_operators_unary.h) + - [/src/graph/node_operators_binary.h](file_src_graph_node_operators_binary.h) + - [/src/graph/node_operators_tuple.h](file_src_graph_node_operators_tuple.h) - Functional Operator - - [/src/functional/operators.h](api/file_src_functional_operators.h.html) + - [/src/functional/operators.h](file_src_functional_operators.h) - Tensor operation - - [/src/tensors/tensor_operators.h](api/file_src_tensors_tensor_operators.h.html) - - [/src/tensors/cpu/tensor_operators.cpp](api/file_src_tensors_cpu_tensor_operators.cpp.html) - - [/src/tensors/gpu/tensor_operators.cu](api/file_src_tensors_gpu_tensor_operators.cu.html) + - [/src/tensors/tensor_operators.h](file_src_tensors_tensor_operators.h) + - [/src/tensors/cpu/tensor_operators.cpp](file_src_tensors_cpu_tensor_operators.cpp) + - [/src/tensors/gpu/tensor_operators.cu](file_src_tensors_gpu_tensor_operators.cu) - Declared Specialization - - [/src/tensors/gpu/element.inc](api/program_listing_file_src_tensors_gpu_element.inc.html) - - [/src/tensors/gpu/add.inc](api/program_listing_file_src_tensors_gpu_add.inc.html) - - [/src/tensors/gpu/add_all.inc](api/program_listing_file_src_tensors_gpu_add_all.inc.html) + - [/src/tensors/gpu/element.inc](program_listing_file_src_tensors_gpu_element.inc) + - [/src/tensors/gpu/add.inc](program_listing_file_src_tensors_gpu_add.inc) + - [/src/tensors/gpu/add_all.inc](program_listing_file_src_tensors_gpu_add_all.inc) To understand how the different components are inter-linked, we'll look at each of them in turn. @@ -197,7 +197,7 @@ this example code, these are optional and, when omitted, calling `NaryNodeOp({a})` would result in a node with the same shape and type as `a`. The `type()` method returns the friendly name for the node. Note that the [ONNX](https://onnx.ai) -[interface](api/program_listing_file_src_onnx_expression_graph_onnx_serialization.cpp.html) +[interface](program_listing_file_src_onnx_expression_graph_onnx_serialization.cpp) maintains a mapping of these friendly names to their ONNX representation. In the absence of any member variables the `hash()` and `equal()` methods can be omitted, and defer to their `NaryNodeOp` definition. However, if such variables @@ -244,7 +244,7 @@ _1 = sin(_2) ``` The placeholders `_1`, `_2` are enabled by code in -[/src/functional](api/dir_src_functional.html) and interoperate with the +[/src/functional](dir_src_functional) and interoperate with the functional operators. In the call to `Element`, `val_` is assigned to `_1` and `child(0)->val()` to `_2`. 
Therefore, this has the action of setting the elements of this node to the result obtained by applying `sin` to the elements @@ -328,7 +328,7 @@ specialization required for each type. The current required types are: - half (see `cuda_fp16.h` in the CUDA Math API) Further details are available in -[/src/common/types.h](api/file_src_common_types.h.html). +[/src/common/types.h](file_src_common_types.h). Returning to the example of `sin(x)`, the specialization for `float` and `double` requires @@ -355,12 +355,12 @@ struct Ops { ``` The remaining specializations can be seen in -[/src/functional/operators.h](api/file_src_functional_operators.h.html). Note +[/src/functional/operators.h](file_src_functional_operators.h). Note that the general template must produce a runtime abort. The final component of the functional operator is to call the macro that enables interoperability with the framework of -[/src/functional](api/dir_src_functional.html). For a unary operator, this is +[/src/functional](dir_src_functional). For a unary operator, this is the macro `UNARY`. ```cpp @@ -392,7 +392,7 @@ representation. Furthermore, the OpenMPI and OpenMP libraries are employed for parallelisation. While macros provided in -[/src/common/definitions.h](api/file_src_common_definitions.h.html) locally +[/src/common/definitions.h](file_src_common_definitions.h) locally enable faster floating-point math in supported compilers. ```cpp @@ -402,14 +402,14 @@ MARIAN_FFAST_MATH_END ``` The usual caveats apply when enabling `fast_math`, and can be found in -[/src/common/definitions.h](api/file_src_common_definitions.h.html) +[/src/common/definitions.h](file_src_common_definitions.h) Tensor operators are declared in -[/src/tensors/tensor_operators.h](api/file_src_tensors_tensor_operators.h.html), +[/src/tensors/tensor_operators.h](file_src_tensors_tensor_operators.h), these are device-agnostic function that call the relevant device-specific implementation. The CPU- and GPU-specific implementation are defined in `cpu` -namespace in [/src/tensors/cpu/](api/dir_src_tensors_cpu.html) and the `gpu` -namespace [/src/tensors/gpu/](api/dir_src_tensors_gpu.html). Therefore a typical +namespace in [/src/tensors/cpu/](dir_src_tensors_cpu) and the `gpu` +namespace [/src/tensors/gpu/](dir_src_tensors_gpu). Therefore a typical operator defers to an implementation in the device-specific namespace. ```cpp @@ -461,16 +461,16 @@ compilation: ``` To fix these undefined references, we must explicitly add the specialization to -the `.inc` files of [/src/tensors/gpu/](api/dir_src_tensors_gpu.html). Each +the `.inc` files of [/src/tensors/gpu/](dir_src_tensors_gpu). Each `.inc` file is included at the end of its corresponding `.cu` file, ensuring that the specialization is compiled. The undefined references should be added to the `.inc` file that corresponds to the header file in which contains the declaration of the missing functions. -The file [element.inc](api/file_src_tensors_gpu_element.inc.html) contains the +The file [element.inc](file_src_tensors_gpu_element.inc) contains the specializations of the function defined in -[element.h](api/file_src_tensors_gpu_element.h.html): +[element.h](file_src_tensors_gpu_element.h): ```cpp // src/tensors/gpu/element.h @@ -478,9 +478,9 @@ template void Element(Functor functor, Tensor out, Tensors... 
tensors); ``` -Similarly, [add.inc](api/file_src_tensors_gpu_add.inc.html) contains the +Similarly, [add.inc](file_src_tensors_gpu_add.inc) contains the specializations for functions matching either of the two signatures in -[add.h](api/file_src_tensors_gpu_add.h.html): +[add.h](file_src_tensors_gpu_add.h): ```cpp // src/tensors/gpu/add.h @@ -491,8 +491,8 @@ template void Aggregate(Functor functor, float initAgg, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors); ``` -Finally [add_all.inc](api/file_src_tensors_gpu_add_all.inc.html) contains the -specializations for [add_all.h](api/file_src_tensors_gpu_add_all.h.html), which +Finally [add_all.inc](file_src_tensors_gpu_add_all.inc) contains the +specializations for [add_all.h](file_src_tensors_gpu_add_all.h), which are several versions of: ```cpp @@ -507,7 +507,7 @@ void AggregateAll(Ptr allocator, const Tensor in1); ``` -However, for [add_all.h](api/file_src_tensors_gpu_add_all.h.html), there is an +However, for [add_all.h](file_src_tensors_gpu_add_all.h), there is an additional type dependence in the first template parameter, which requires two entries: diff --git a/doc/requirements.txt b/doc/requirements.txt index b4b2038aa..a2416e9a1 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -2,6 +2,7 @@ sphinx==2.4.4 breathe==4.13.0 exhale sphinx_rtd_theme -recommonmark +myst-parser==0.14.0a3 mistune<2.0.0 m2r +sphinx-mathjax-offline diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index e34ddc8ac..dc756c7d6 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -19,7 +19,7 @@ Expr checkpoint(Expr a); typedef Expr(ActivationFunction)(Expr); ///< ActivationFunction has signature Expr(Expr) /** - * Convience typedef for graph @ref lambda expressions. + * Convenience typedef for graph @ref lambda expressions. */ typedef std::function& in)> LambdaNodeFunctor; @@ -114,7 +114,7 @@ Expr tanh(const std::vector& nodes); /** * @copybrief tanh - * Convience function to put parameter pack @p Args into a Expr vector + * Convenience function to put parameter pack @p Args into a Expr vector */ template Expr tanh(Args... args) { @@ -188,8 +188,7 @@ Expr prelu(const std::vector&, float alpha = 0.01); * @{ */ -///@name Exponentiation and Logarithmic functions -///@{ +// Exponentiation and Logarithmic functions /** * Natural logarithm. * Computes the element-wise natural logarithm of the expression: @f$ \log(a) @f$ @@ -203,10 +202,8 @@ Expr log(Expr a); * @see ExpNodeOp */ Expr exp(Expr a); -///@} -///@name Trigonometric functions -///@{ +// Trigonometric functions /** * Sine. Computes the element-wise sine of the expression: @f$ \sin(a) @f$. * @see SinNodeOp @@ -225,7 +222,6 @@ Expr cos(Expr a); */ Expr tan(Expr a); ///@} -///@} /** * @addtogroup graph_ops_arithmetic Arithmetic @@ -238,52 +234,42 @@ Expr tan(Expr a); * Returns @f$ -a @f$. * @see NegNodeOp for implementation. */ -///@{ Expr operator-(Expr a); -///@} /*********************************************************/ /** - * @name Addition + * Addition * Performs @f$ a + b @f$ in the expression graph. */ -///@{ Expr operator+(Expr a, Expr b); ///< @see Implementation in PlusNodeOp Expr operator+(float a, Expr b); ///< @see Implementation in ScalarAddNodeOp Expr operator+(Expr a, float b); ///< @see Implementation in ScalarAddNodeOp -///@} /** - * @name Subtraction + * Subtraction * Performs @f$ a - b @f$ in the expression graph. 
*/ -///@{ Expr operator-(Expr a, Expr b); ///< @see Implementation in MinusNodeOp Expr operator-(float a, Expr b); ///< @see Implementation in ScalarAddNodeOp Expr operator-(Expr a, float b); ///< @see Implementation in ScalarAddNodeOp -///@} /** - * @name Multiplication + * Multiplication * Performs @f$ a * b @f$ in the expression graph. */ -///@{ Expr operator*(Expr a, Expr b); ///< @see Implementation in MultNodeOp Expr operator*(float a, Expr b); ///< @see Implementation in ScalarMultNodeOp Expr operator*(Expr a, float b); ///< @see Implementation in ScalarMultNodeOp -///@} /** - * @name Division + * Division * Performs @f$ a / b @f$ in the expression graph. */ -///@{ Expr operator/(Expr a, Expr b); ///< @see Implementation in DivNodeOp Expr operator/(float a, Expr b); ///< Promotes @p a to Expression and uses operator/(Expr a, Expr b). ///< @todo efficient version of this without ExpressionGraph::constant Expr operator/(Expr a, float b); ///< Implementation via @f$ a * \frac{1}{b} @f$. -///@} ///@} @@ -324,14 +310,13 @@ Expr logaddexp(Expr a, Expr b); ///@addtogroup graph_ops_mathematical ///@{ -/** - * @name Element-wise min/max +/* + * Element-wise min/max * Performs an element-wise min max comparison between expressions. * @see min, max for axis level operations * @see MinimumNodeOp, MaximumNodeOp * @todo implement version without ExpressionGraph::constant. */ -///@{ /** * Computes the element-wise maximum of its inputs. @@ -367,7 +352,6 @@ Expr minimum(float a, Expr b); */ Expr minimum(Expr a, float b); ///@} -///@} /** * Pair of expressions. @@ -428,23 +412,20 @@ Expr2 argmin(Expr a, int axis); * @{ */ -/** - * @name Expr-Expr comparisons +/* + * Expr-Expr comparisons */ -///@{ Expr lt(Expr a, Expr b); ///< @f$ a < b @f$ Expr eq(Expr a, Expr b); ///< @f$ a \equiv b @f$ Expr gt(Expr a, Expr b); ///< @f$ a > b @f$ Expr ge(Expr a, Expr b); ///< @f$ a \geq b @f$ Expr ne(Expr a, Expr b); ///< @f$ a \neq b @f$ Expr le(Expr a, Expr b); ///< @f$ a \leq b @f$ -///@} -/** - * @name Float-Expr comparisons +/* + * Float-Expr comparisons * Floats are promoted to a @ref ExpressionGraph::constant and use the Expr-Expr methods */ -///@{ Expr lt(float a, Expr b); ///< @f$ a < b @f$ Expr eq(float a, Expr b); ///< @f$ a \equiv b @f$ Expr gt(float a, Expr b); ///< @f$ a > b @f$ @@ -458,7 +439,6 @@ Expr gt(Expr a, float b); ///< @f$ a > b @f$ Expr ge(Expr a, float b); ///< @f$ a \geq b @f$ Expr ne(Expr a, float b); ///< @f$ a \neq b @f$ Expr le(Expr a, float b); ///< @f$ a \leq b @f$ -///@} ///@} @@ -810,8 +790,7 @@ static inline Expr narrow(Expr a, int axis, size_t start, size_t length) { ///@addtogroup graph_ops_mathematical ///@{ -///@name Aggregations -///@{ +// Aggregations /** * Compute the sum along the specified axis. @@ -862,7 +841,6 @@ Expr min(Expr a, int ax); */ Expr prod(Expr a, int ax); -///@} ///@} /** From e8a1a2530fb84cbff7383302ebca393e5875c441 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 7 Dec 2021 17:47:33 +0000 Subject: [PATCH 131/254] Fix AVX2+ detection on Mac (#895) MacOS is weird and its CPU flags are separated in two separate fields returned by the sysctl interface. 
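For reference, the two fields in question can be read with the standard `sysctlbyname(3)` call;
the stand-alone sketch below (not part of this patch) simply prints both lists:

```cpp
// Sketch only: print both CPU-flag fields on macOS; a missing key is silently skipped.
#include <sys/sysctl.h>
#include <cstdio>

int main() {
  const char* keys[] = {"machdep.cpu.features", "machdep.cpu.leaf7_features"};
  for (const char* key : keys) {
    char buf[2048];
    size_t len = sizeof(buf);
    if (sysctlbyname(key, buf, &len, nullptr, 0) == 0)
      std::printf("%s: %s\n", key, buf); // AVX appears in the first field, AVX2/AVX512 in the second
  }
  return 0;
}
```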
To get around this, we need to test both of them, so here goes Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 1 + cmake/FindSSE.cmake | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 921924ccc..80715ad33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option ### Fixed +- Fix AVX2 and AVX512 detection on MacOS - Add GCC11 support into FBGEMM - Added pragma to ignore unused-private-field error on elementType_ on macOS - Do not set guided alignments for case augmented data if vocab is not factored diff --git a/cmake/FindSSE.cmake b/cmake/FindSSE.cmake index 82ee7f3e0..e1c58fbc9 100644 --- a/cmake/FindSSE.cmake +++ b/cmake/FindSSE.cmake @@ -74,8 +74,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ENDIF (AVX512_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") - EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE - CPUINFO) + EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features machdep.cpu.leaf7_features" OUTPUT_VARIABLE CPUINFO) STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO}) STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE) From e26e5b6faf7c826868efe12316e0b02499d4edcf Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Thu, 16 Dec 2021 15:07:34 +0000 Subject: [PATCH 132/254] Use apple accelerate on MacOs by default (#897) --- .github/workflows/macos.yml | 11 ++--------- CHANGELOG.md | 1 + CMakeLists.txt | 6 +++++- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 5e3e57c63..20907d9b6 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -18,16 +18,10 @@ jobs: submodules: recursive - name: Install dependencies - run: brew install boost openblas openssl protobuf + run: brew install boost openssl protobuf - # Openblas location is exported explicitly because openblas is keg-only, - # which means it was not symlinked into /usr/local/. - # CMake cannot find BLAS on GitHub runners if Marian is being compiled - # statically, hence USE_STATIC_LIBS=off - name: Configure CMake run: | - export LDFLAGS="-L/usr/local/opt/openblas/lib" - export CPPFLAGS="-I/usr/local/opt/openblas/include" mkdir -p build cd build cmake .. \ @@ -37,8 +31,7 @@ jobs: -DCOMPILE_SERVER=on \ -DCOMPILE_TESTS=on \ -DUSE_FBGEMM=on \ - -DUSE_SENTENCEPIECE=on \ - -DUSE_STATIC_LIBS=off + -DUSE_SENTENCEPIECE=on - name: Compile working-directory: build diff --git a/CHANGELOG.md b/CHANGELOG.md index 80715ad33..a5dd305f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Enforce validation for the task alias in training mode. ### Changed +- MacOS marian uses Apple Accelerate framework by default, as opposed to openblas/mkl. - Optimize LSH for speed by treating is as a shortlist generator. No option changes in decoder - Set REQUIRED_BIAS_ALIGNMENT = 16 in tensors/gpu/prod.cpp to avoid memory-misalignment on certain Ampere GPUs. 
- For BUILD_ARCH != native enable all intrinsics types by default, can be disabled like this: -DCOMPILE_AVX512=off diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e6f24c74..eb6ca97b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,11 @@ option(COMPILE_CUDA "Compile GPU version" ON) option(COMPILE_EXAMPLES "Compile examples" OFF) option(COMPILE_SERVER "Compile marian-server" OFF) option(COMPILE_TESTS "Compile tests" OFF) -option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" OFF) +if(APPLE) + option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" ON) +else(APPLE) + option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" OFF) +endif(APPLE) option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) From c84599d08ad69059279abd5a7417a8053db8b631 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 16 Dec 2021 15:07:55 +0000 Subject: [PATCH 133/254] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 89f579488..1dd05f471 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.25 +v1.10.26 From b29cc07a95f49df7825f3a92e860bd642db0e812 Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Tue, 18 Jan 2022 12:58:52 +0000 Subject: [PATCH 134/254] Scorer model loading (#860) * Add MMAP as an option * Use io::isBin * Allow getYamlFromModel from an Item vector * ScorerWrapper can now load on to a graph from Item vector The interface IEncoderDecoder can now call graph loads directly from an Item Vector. * Translator loads model before creating scorers Scorers are created from an Item vector * Replace model-config try-catch with check using IsNull * Prefer empty vs size * load by items should be pure virtual * Stepwise forward load to encdec * nematus can load from items * amun can load from items * loadItems in TranslateService * Remove logging * Remove by filename scorer functions * Replace by filename createScorer * Explicitly provide default value for get model-mmap * CLI option for model-mmap only for translation and CPU compile * Ensure model-mmap option is CPU only * Remove move on temporary object * Reinstate log messages for model loading in Amun / Nematus * Add log messages for model loading in scorers Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 1 + src/common/config_parser.cpp | 7 ++- src/common/config_validator.cpp | 3 ++ src/common/io.cpp | 12 +++++ src/common/io.h | 1 + src/graph/expression_graph.h | 2 +- src/models/amun.h | 14 ++++-- src/models/costs.h | 6 +++ src/models/encoder_decoder.cpp | 6 +++ src/models/encoder_decoder.h | 9 ++++ src/models/nematus.h | 16 ++++--- src/translator/scorers.cpp | 51 ++++++++++++++-------- src/translator/scorers.h | 21 ++++++++- src/translator/translator.h | 77 +++++++++++++++++++-------------- 14 files changed, 162 insertions(+), 64 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5dd305f7..d42c652ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Dynamic gradient-scaling with `--dynamic-gradient-scaling`. - Add unit tests for binary files. 
- Fix compilation with OMP +- Added `--model-mmap` option to enable mmap loading for CPU-based translation - Compute aligned memory sizes using exact sizing - Support for loading lexical shortlist from a binary blob - Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 8da9520c8..9705d5b7a 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -183,7 +183,12 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Path prefix for pre-trained model to initialize model weights"); } } - +#ifdef COMPILE_CPU + if(mode_ == cli::mode::translation) { + cli.add("--model-mmap", + "Use memory-mapping when loading model (CPU only)"); + } +#endif cli.add("--ignore-model-config", "Ignore the model configuration saved in npz file"); cli.add("--type", diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index fea7578f3..b0230da99 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -54,6 +54,9 @@ void ConfigValidator::validateOptionsTranslation() const { ABORT_IF(models.empty() && configs.empty(), "You need to provide at least one model file or a config file"); + ABORT_IF(get("model-mmap") && get("cpu-threads") == 0, + "Model MMAP is CPU-only, please use --cpu-threads"); + for(const auto& modelFile : models) { filesystem::Path modelPath(modelFile); ABORT_IF(!filesystem::exists(modelPath), "Model file does not exist: " + modelFile); diff --git a/src/common/io.cpp b/src/common/io.cpp index a9984b5df..e0b3f39a5 100644 --- a/src/common/io.cpp +++ b/src/common/io.cpp @@ -56,6 +56,18 @@ void getYamlFromModel(YAML::Node& yaml, yaml = YAML::Load(item.data()); } +// Load YAML from item +void getYamlFromModel(YAML::Node& yaml, + const std::string& varName, + const std::vector& items) { + for(auto& item : items) { + if(item.name == varName) { + yaml = YAML::Load(item.data()); + return; + } + } +} + void addMetaToItems(const std::string& meta, const std::string& varName, std::vector& items) { diff --git a/src/common/io.h b/src/common/io.h index 2d18d66e8..3f340ed2f 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -21,6 +21,7 @@ bool isBin(const std::string& fileName); void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const std::string& fileName); void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const void* ptr); +void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const std::vector& items); void addMetaToItems(const std::string& meta, const std::string& varName, diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 553a5d63b..c532abffd 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -739,7 +739,7 @@ class ExpressionGraph : public std::enable_shared_from_this { public: /** Load model (mainly parameter objects) from array of io::Items */ - void load(std::vector& ioItems, bool markReloaded = true) { + void load(const std::vector& ioItems, bool markReloaded = true) { setReloaded(false); for(auto& item : ioItems) { std::string pName = item.name; diff --git a/src/models/amun.h b/src/models/amun.h index 1bfda2697..135ce3597 100644 --- a/src/models/amun.h +++ b/src/models/amun.h @@ -36,7 +36,7 @@ class Amun : public EncoderDecoder { } void load(Ptr graph, - const std::string& name, + const std::vector& items, bool /*markedReloaded*/ = true) override { std::map 
nameMap = {{"decoder_U", "decoder_cell1_U"}, @@ -89,9 +89,7 @@ class Amun : public EncoderDecoder { if(opt("tied-embeddings-src") || opt("tied-embeddings-all")) nameMap["Wemb"] = "Wemb"; - LOG(info, "Loading model from {}", name); - // load items from .npz file - auto ioItems = io::loadItems(name); + auto ioItems = items; // map names and remove a dummy matrices for(auto it = ioItems.begin(); it != ioItems.end();) { // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size @@ -120,6 +118,14 @@ class Amun : public EncoderDecoder { graph->load(ioItems); } + void load(Ptr graph, + const std::string& name, + bool /*markReloaded*/ = true) override { + LOG(info, "Loading model from {}", name); + auto ioItems = io::loadItems(name); + load(graph, ioItems); + } + void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { diff --git a/src/models/costs.h b/src/models/costs.h index e5463bfd0..982a13c57 100644 --- a/src/models/costs.h +++ b/src/models/costs.h @@ -325,6 +325,12 @@ class Stepwise : public IEncoderDecoder { public: Stepwise(Ptr encdec, Ptr cost) : encdec_(encdec), cost_(cost) {} + virtual void load(Ptr graph, + const std::vector& items, + bool markedReloaded = true) override { + encdec_->load(graph, items, markedReloaded); + } + virtual void load(Ptr graph, const std::string& name, bool markedReloaded = true) override { diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 66ff16cec..bb938ee55 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -144,6 +144,12 @@ std::string EncoderDecoder::getModelParametersAsString() { return std::string(out.c_str()); } +void EncoderDecoder::load(Ptr graph, + const std::vector& items, + bool markedReloaded) { + graph->load(items, markedReloaded && !opt("ignore-model-config", false)); +} + void EncoderDecoder::load(Ptr graph, const std::string& name, bool markedReloaded) { diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h index 92c1647fa..0fbf3fafe 100644 --- a/src/models/encoder_decoder.h +++ b/src/models/encoder_decoder.h @@ -12,6 +12,11 @@ namespace marian { class IEncoderDecoder : public models::IModel { public: virtual ~IEncoderDecoder() {} + + virtual void load(Ptr graph, + const std::vector& items, + bool markedReloaded = true) = 0; + virtual void load(Ptr graph, const std::string& name, bool markedReloaded = true) override @@ -91,6 +96,10 @@ class EncoderDecoder : public IEncoderDecoder, public LayerBase { void push_back(Ptr decoder); + virtual void load(Ptr graph, + const std::vector& items, + bool markedReloaded = true) override; + virtual void load(Ptr graph, const std::string& name, bool markedReloaded = true) override; diff --git a/src/models/nematus.h b/src/models/nematus.h index 730418e5f..aee8e3b04 100644 --- a/src/models/nematus.h +++ b/src/models/nematus.h @@ -26,11 +26,9 @@ class Nematus : public EncoderDecoder { } void load(Ptr graph, - const std::string& name, + const std::vector& items, bool /*markReloaded*/ = true) override { - LOG(info, "Loading model from {}", name); - // load items from .npz file - auto ioItems = io::loadItems(name); + auto ioItems = items; // map names and remove a dummy matrix 'decoder_c_tt' from items to avoid creating isolated node for(auto it = ioItems.begin(); it != ioItems.end();) { // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension 
being 1 and second dimension of the original size @@ -41,7 +39,7 @@ class Nematus : public EncoderDecoder { it->shape.set(0, 1); it->shape.set(1, dim); } - + if(it->name == "decoder_c_tt") { it = ioItems.erase(it); } else if(it->name == "uidx") { @@ -59,6 +57,14 @@ class Nematus : public EncoderDecoder { graph->load(ioItems); } + void load(Ptr graph, + const std::string& name, + bool /*markReloaded*/ = true) override { + LOG(info, "Loading model from {}", name); + auto ioItems = io::loadItems(name); + load(graph, ioItems); + } + void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { diff --git a/src/translator/scorers.cpp b/src/translator/scorers.cpp index d1c8b1602..60ec03dd1 100644 --- a/src/translator/scorers.cpp +++ b/src/translator/scorers.cpp @@ -5,7 +5,7 @@ namespace marian { Ptr scorerByType(const std::string& fname, float weight, - const std::string& model, + std::vector items, Ptr options) { options->set("inference", true); std::string type = options->get("type"); @@ -22,7 +22,7 @@ Ptr scorerByType(const std::string& fname, LOG(info, "Loading scorer of type {} as feature {}", type, fname); - return New(encdec, fname, weight, model); + return New(encdec, fname, weight, items); } Ptr scorerByType(const std::string& fname, @@ -47,30 +47,30 @@ Ptr scorerByType(const std::string& fname, return New(encdec, fname, weight, ptr); } -std::vector> createScorers(Ptr options) { +std::vector> createScorers(Ptr options, const std::vector> models) { std::vector> scorers; - auto models = options->get>("models"); - std::vector weights(models.size(), 1.f); if(options->hasAndNotEmpty("weights")) weights = options->get>("weights"); bool isPrevRightLeft = false; // if the previous model was a right-to-left model size_t i = 0; - for(auto model : models) { + for(auto items : models) { std::string fname = "F" + std::to_string(i); // load options specific for the scorer auto modelOptions = New(options->clone()); - try { - if(!options->get("ignore-model-config")) { - YAML::Node modelYaml; - io::getYamlFromModel(modelYaml, "special:model.yml", model); + if(!options->get("ignore-model-config")) { + YAML::Node modelYaml; + io::getYamlFromModel(modelYaml, "special:model.yml", items); + if(!modelYaml.IsNull()) { + LOG(info, "Loaded model config"); modelOptions->merge(modelYaml, true); } - } catch(std::runtime_error&) { - LOG(warn, "No model settings found in model file"); + else { + LOG(warn, "No model settings found in model file"); + } } // l2r and r2l cannot be used in the same ensemble @@ -85,13 +85,24 @@ std::vector> createScorers(Ptr options) { } } - scorers.push_back(scorerByType(fname, weights[i], model, modelOptions)); + scorers.push_back(scorerByType(fname, weights[i], items, modelOptions)); i++; } return scorers; } +std::vector> createScorers(Ptr options) { + std::vector> model_items; + auto models = options->get>("models"); + for(auto model : models) { + auto items = io::loadItems(model); + model_items.push_back(std::move(items)); + } + + return createScorers(options, model_items); +} + std::vector> createScorers(Ptr options, const std::vector& ptrs) { std::vector> scorers; @@ -105,14 +116,16 @@ std::vector> createScorers(Ptr options, const std::vector(options->clone()); - try { - if(!options->get("ignore-model-config")) { - YAML::Node modelYaml; - io::getYamlFromModel(modelYaml, "special:model.yml", ptr); + if(!options->get("ignore-model-config")) { + YAML::Node modelYaml; + io::getYamlFromModel(modelYaml, "special:model.yml", ptr); + if(!modelYaml.IsNull()) { + 
LOG(info, "Loaded model config"); modelOptions->merge(modelYaml, true); } - } catch(std::runtime_error&) { - LOG(warn, "No model settings found in model file"); + else { + LOG(warn, "No model settings found in model file"); + } } scorers.push_back(scorerByType(fname, weights[i], ptr, modelOptions)); diff --git a/src/translator/scorers.h b/src/translator/scorers.h index a5a0be2cb..72ebff5df 100644 --- a/src/translator/scorers.h +++ b/src/translator/scorers.h @@ -73,9 +73,19 @@ class ScorerWrapper : public Scorer { private: Ptr encdec_; std::string fname_; + std::vector items_; const void* ptr_; public: + ScorerWrapper(Ptr encdec, + const std::string& name, + float weight, + std::vector& items) + : Scorer(name, weight), + encdec_(std::static_pointer_cast(encdec)), + items_(items), + ptr_{0} {} + ScorerWrapper(Ptr encdec, const std::string& name, float weight, @@ -97,7 +107,9 @@ class ScorerWrapper : public Scorer { virtual void init(Ptr graph) override { graph->switchParams(getName()); - if(ptr_) + if(!items_.empty()) + encdec_->load(graph, items_); + else if(ptr_) encdec_->mmap(graph, ptr_); else encdec_->load(graph, fname_); @@ -142,12 +154,19 @@ class ScorerWrapper : public Scorer { } }; +Ptr scorerByType(const std::string& fname, + float weight, + std::vector items, + Ptr options); + Ptr scorerByType(const std::string& fname, float weight, const std::string& model, Ptr config); + std::vector> createScorers(Ptr options); +std::vector> createScorers(Ptr options, const std::vector> models); Ptr scorerByType(const std::string& fname, float weight, diff --git a/src/translator/translator.h b/src/translator/translator.h index db1f3d030..4084ced95 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -20,12 +20,7 @@ #include "translator/scorers.h" // currently for diagnostics only, will try to mmap files ending in *.bin suffix when enabled. -// @TODO: add this as an actual feature. 
-#define MMAP 0 - -#if MMAP #include "3rd_party/mio/mio.hpp" -#endif namespace marian { @@ -42,9 +37,8 @@ class Translate : public ModelTask { size_t numDevices_; -#if MMAP - std::vector mmaps_; -#endif + std::vector model_mmaps_; // map + std::vector> model_items_; // non-mmap public: Translate(Ptr options) @@ -76,15 +70,21 @@ class Translate : public ModelTask { scorers_.resize(numDevices_); graphs_.resize(numDevices_); -#if MMAP auto models = options->get>("models"); - for(auto model : models) { - marian::filesystem::Path modelPath(model); - ABORT_IF(modelPath.extension() != marian::filesystem::Path(".bin"), - "Non-binarized models cannot be mmapped"); - mmaps_.push_back(std::move(mio::mmap_source(model))); + if(options_->get("model-mmap", false)) { + for(auto model : models) { + ABORT_IF(!io::isBin(model), "Non-binarized models cannot be mmapped"); + LOG(info, "Loading model from {}", model); + model_mmaps_.push_back(mio::mmap_source(model)); + } + } + else { + for(auto model : models) { + LOG(info, "Loading model from {}", model); + auto items = io::loadItems(model); + model_items_.push_back(std::move(items)); + } } -#endif size_t id = 0; for(auto device : devices) { @@ -101,11 +101,14 @@ class Translate : public ModelTask { graph->reserveWorkspaceMB(options_->get("workspace")); graphs_[id] = graph; -#if MMAP - auto scorers = createScorers(options_, mmaps_); -#else - auto scorers = createScorers(options_); -#endif + std::vector> scorers; + if(options_->get("model-mmap", false)) { + scorers = createScorers(options_, model_mmaps_); + } + else { + scorers = createScorers(options_, model_items_); + } + for(auto scorer : scorers) { scorer->init(graph); if(shortlistGenerator_) @@ -146,11 +149,11 @@ class Translate : public ModelTask { std::mutex syncCounts; // timer and counters for total elapsed time and statistics - std::unique_ptr totTimer(new timer::Timer()); + std::unique_ptr totTimer(new timer::Timer()); size_t totBatches = 0; size_t totLines = 0; size_t totSourceTokens = 0; - + // timer and counters for elapsed time and statistics between updates std::unique_ptr curTimer(new timer::Timer()); size_t curBatches = 0; @@ -176,7 +179,7 @@ class Translate : public ModelTask { bg.prepare(); for(auto batch : bg) { auto task = [=, &syncCounts, - &totBatches, &totLines, &totSourceTokens, &totTimer, + &totBatches, &totLines, &totSourceTokens, &totTimer, &curBatches, &curLines, &curSourceTokens, &curTimer](size_t id) { thread_local Ptr graph; thread_local std::vector> scorers; @@ -200,12 +203,12 @@ class Translate : public ModelTask { } // if we asked for speed information display this - if(statFreq.n > 0) { + if(statFreq.n > 0) { std::lock_guard lock(syncCounts); - totBatches++; + totBatches++; totLines += batch->size(); totSourceTokens += batch->front()->batchWords(); - + curBatches++; curLines += batch->size(); curSourceTokens += batch->front()->batchWords(); @@ -214,10 +217,10 @@ class Translate : public ModelTask { double totTime = totTimer->elapsed(); double curTime = curTimer->elapsed(); - LOG(info, - "Processed {} batches, {} lines, {} source tokens in {:.2f}s - Speed (since last): {:.2f} batches/s - {:.2f} lines/s - {:.2f} tokens/s", + LOG(info, + "Processed {} batches, {} lines, {} source tokens in {:.2f}s - Speed (since last): {:.2f} batches/s - {:.2f} lines/s - {:.2f} tokens/s", totBatches, totLines, totSourceTokens, totTime, curBatches / curTime, curLines / curTime, curSourceTokens / curTime); - + // reset stats between updates curBatches = curLines = curSourceTokens = 0; 
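The statistics code in this hunk boils down to a small pattern: worker threads bump shared counters under a mutex and periodically report a since-last rate next to a total rate. A framework-free sketch of that pattern, with counter names mirroring the ones above and `statFreq` standing in for `--stat-freq`:

```cpp
#include <chrono>
#include <cstdio>
#include <mutex>

// Hedged sketch, not marian code: shared counters guarded by a mutex, with a
// "current window" that is reset after every report, as in the loop above.
struct ThroughputStats {
  std::mutex mut;
  size_t totBatches = 0, totLines = 0, curBatches = 0, curLines = 0;
  std::chrono::steady_clock::time_point totStart = std::chrono::steady_clock::now();
  std::chrono::steady_clock::time_point curStart = totStart;

  void update(size_t lines, size_t statFreq) {
    std::lock_guard<std::mutex> lock(mut);
    ++totBatches; ++curBatches;
    totLines += lines; curLines += lines;
    if(statFreq > 0 && totBatches % statFreq == 0) {
      auto now = std::chrono::steady_clock::now();
      double totTime = std::chrono::duration<double>(now - totStart).count();
      double curTime = std::chrono::duration<double>(now - curStart).count();
      std::printf("Processed %zu batches, %zu lines in %.2fs - %.2f lines/s (since last)\n",
                  totBatches, totLines, totTime, curLines / curTime);
      curBatches = curLines = 0;   // reset the between-updates window
      curStart = now;
    }
  }
};
```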
curTimer.reset(new timer::Timer()); @@ -230,12 +233,12 @@ class Translate : public ModelTask { // make sure threads are joined before other local variables get de-allocated threadPool.join_all(); - + // display final speed numbers over total translation if intermediate displays were requested if(statFreq.n > 0) { double totTime = totTimer->elapsed(); - LOG(info, - "Processed {} batches, {} lines, {} source tokens in {:.2f}s - Speed (total): {:.2f} batches/s - {:.2f} lines/s - {:.2f} tokens/s", + LOG(info, + "Processed {} batches, {} lines, {} source tokens in {:.2f}s - Speed (total): {:.2f} batches/s - {:.2f} lines/s - {:.2f} tokens/s", totBatches, totLines, totSourceTokens, totTime, totBatches / totTime, totLines / totTime, totSourceTokens / totTime); } } @@ -288,6 +291,14 @@ class TranslateService : public ModelServiceTask { auto devices = Config::getDevices(options_); numDevices_ = devices.size(); + // preload models + std::vector> model_items_; + auto models = options->get>("models"); + for(auto model : models) { + auto items = io::loadItems(model); + model_items_.push_back(std::move(items)); + } + // initialize scorers for(auto device : devices) { auto graph = New(true); @@ -303,7 +314,7 @@ class TranslateService : public ModelServiceTask { graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); - auto scorers = createScorers(options_); + auto scorers = createScorers(options_, model_items_); for(auto scorer : scorers) { scorer->init(graph); if(shortlistGenerator_) From b64e258bda3d9134a39f41229776b630bc187094 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 18 Jan 2022 12:59:37 +0000 Subject: [PATCH 135/254] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 1dd05f471..447243316 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.26 +v1.10.27 From 894a07ad5b42a18fc174bedf0f2abe5e358fcc19 Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Mon, 24 Jan 2022 15:28:13 +0000 Subject: [PATCH 136/254] Improve checks on transformer cache (#881) * Fix caching in transformer attention * Move hash specialization * Swap comments to doxygen * Include string header --- src/CMakeLists.txt | 1 + src/common/hash.cpp | 12 +++++++ src/common/hash.h | 24 +++++++++++--- src/models/transformer.h | 69 ++++++++++++++++++++-------------------- 4 files changed, 67 insertions(+), 39 deletions(-) create mode 100644 src/common/hash.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4599c407..3718807a5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,6 +30,7 @@ set(MARIAN_SOURCES common/filesystem.cpp common/file_stream.cpp common/file_utils.cpp + common/hash.cpp common/signal_handling.cpp common/types.cpp diff --git a/src/common/hash.cpp b/src/common/hash.cpp new file mode 100644 index 000000000..57e5e9145 --- /dev/null +++ b/src/common/hash.cpp @@ -0,0 +1,12 @@ +#include + +#include "hash.h" +#include "common/shape.h" + +namespace std { +size_t hash>::operator()(pair const& k) const { + size_t seed = hash{}(k.first); + marian::util::hash_combine(seed, k.second.hash()); + return seed; +} +} // namespace std diff --git a/src/common/hash.h b/src/common/hash.h index 7aca30de2..37dab5e76 100644 --- a/src/common/hash.h +++ b/src/common/hash.h @@ -7,16 +7,18 @@ namespace util { template using hash = std::hash; -// This combinator is based on boost::hash_combine, but uses -// std::hash as the hash implementation. Used as a drop-in -// replacement for boost::hash_combine. 
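The new `hash.cpp`/`hash.h` pieces in this patch provide a boost-style `hash_combine` plus a `std::hash` specialization so that a (prefix string, shape) pair can key an `unordered_map`. A self-contained sketch of the same idea, with a toy stand-in for `marian::Shape` (the real class already provides `hash()` and `operator==`; everything named `Toy*` here is an assumption for the demo):

```cpp
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Toy stand-in for marian::Shape: all the sketch needs is a hash() over the
// dimensions and equality comparison.
struct ToyShape {
  std::vector<int> dims;
  size_t hash() const {
    size_t seed = 0;
    for(int d : dims)
      seed ^= std::hash<int>{}(d) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    return seed;
  }
  bool operator==(const ToyShape& o) const { return dims == o.dims; }
};

// boost::hash_combine-style mixing built on std::hash, as in common/hash.h.
template <class T>
inline void hash_combine(size_t& seed, const T& v) {
  seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// Specialization mirroring the new hash.cpp: hash the prefix string, then mix
// in the shape hash, so (name, shape) pairs can key an unordered_map.
namespace std {
template <>
struct hash<pair<string, ToyShape>> {
  size_t operator()(pair<string, ToyShape> const& k) const {
    size_t seed = hash<string>{}(k.first);
    hash_combine(seed, k.second.hash());
    return seed;
  }
};
}  // namespace std

int main() {
  std::unordered_map<std::pair<std::string, ToyShape>, int> cache;
  cache[{"enc_keys", ToyShape{{64, 8, 20, 32}}}] = 1;
  // A different shape under the same prefix is a distinct cache entry.
  cache[{"enc_keys", ToyShape{{64, 8, 35, 32}}}] = 2;
  return cache.size() == 2 ? 0 : 1;
}
```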
+/** + * Combine hash values. + * This combinator is based on boost::hash_combine, but uses std::hash as the hash implementation. + * Used as a drop-in replacement for boost::hash_combine. + */ template inline void hash_combine(HashType& seed, T const& v) { hash hasher; seed ^= static_cast(hasher(v)) + 0x9e3779b9 + (seed<<6) + (seed>>2); } -// Hash a whole chunk of memory, mostly used for diagnostics +/** Hash a whole chunk of memory. */ template inline HashType hashMem(const T* beg, size_t len) { HashType seed = 0; @@ -25,5 +27,17 @@ inline HashType hashMem(const T* beg, size_t len) { return seed; } -} +} // namespace util + +struct Shape; // Forward declaration +} // namespace marian + +namespace std { +/** + * std::hash specialization for the string-shape pair used as a cache key in transformer.h. + */ +template <> +struct hash> { + size_t operator()(pair const& k) const; +}; } diff --git a/src/models/transformer.h b/src/models/transformer.h index 7ec40dc58..af877600e 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -5,6 +5,7 @@ #include "marian.h" +#include "common/hash.h" #include "layers/constructors.h" #include "models/decoder.h" #include "models/encoder.h" @@ -28,7 +29,7 @@ class Transformer : public EncoderOrDecoderBase { protected: using Base::options_; using Base::inference_; using Base::batchIndex_; using Base::graph_; - std::unordered_map cache_; // caching transformation of the encoder that should not be created again + std::unordered_map, Expr> cache_; // caching transformation of the encoder that should not be created again mutable/*lazy*/ std::vector sinusoidalEmbeddingsFreq_, sinusoidalEmbeddingsOffs_; // cached contributions to sinusoidal embeddings bool depthScaling_{false}; // As recommended in the GPT-2 paper, down-scale layer weights by a factor of 1 / sqrt(depth); @@ -40,16 +41,16 @@ class Transformer : public EncoderOrDecoderBase { std::vector alignments_; // [max tgt len or 1][beam depth, max src length, batch size, 1] // @TODO: make this go away - template - T opt(const char* const key) const { Ptr options = options_; return options->get(key); } + template + T opt(const char* const key) const { Ptr options = options_; return options->get(key); } - template - T opt(const std::string& key) const { return opt(key.c_str()); } + template + T opt(const std::string& key) const { return opt(key.c_str()); } - template + template T opt(const char* const key, const T& def) const { Ptr options = options_; return options->get(key, def); } - template + template T opt(const std::string& key, const T& def) const { opt(key.c_str(), def); } public: @@ -256,7 +257,7 @@ class Transformer : public EncoderOrDecoderBase { // take softmax along src sequence axis (-1) auto weights = softmax(z); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] - + if(saveAttentionWeights) collectOneHead(weights, dimBeam); @@ -289,26 +290,26 @@ class Transformer : public EncoderOrDecoderBase { // Caching transformation of the encoder that should not be created again. 
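For readers less familiar with the idiom used in the updated attention code below: `unordered_map::insert` returns an `{iterator, inserted}` pair, so a single lookup can either reuse a cached value or claim the slot for a freshly computed one. A framework-free sketch, with a plain `string` key and `int` value standing in for the `(prefix, Shape)` key and `Expr` value used here:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Hedged illustration of the compute-if-absent idiom: insert a placeholder,
// and only compute the real value if the key was not already present.
std::unordered_map<std::string, int> cache;

int cachedSquare(const std::string& key, int x) {
  auto result = cache.insert({key, 0});  // placeholder, like the empty Expr in the hunk below
  if(!result.second)                     // already cached: reuse
    return result.first->second;
  result.first->second = x * x;          // newly inserted: compute and store
  return result.first->second;
}

int main() {
  std::cout << cachedSquare("k", 3) << " " << cachedSquare("k", 5) << "\n";  // prints "9 9"
}
```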
// @TODO: set this automatically by memoizing encoder context and // memoization propagation (short-term) - if (cache // if caching - && cache_.count(prefix + "_keys") > 0 // and the keys expression has been seen - && cache_[prefix + "_keys"]->shape().elements() == keys->shape().elements()) { // and the underlying element size did not change - kh = cache_[prefix + "_keys"]; // then return cached tensor - } - else { + std::pair, Expr>::iterator, bool> cache_result; + if (cache + && !((cache_result = cache_.insert(std::pair, Expr>({prefix + "_keys", keys->shape()}, kh))).second) + ) { + kh = cache_result.first->second; + } else { int dimKeys = keys->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation auto Wk = graph_->param(prefix + "_Wk", {dimKeys, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); auto bk = graph_->param(prefix + "_bk", {1, dimModel}, inits::zeros()); kh = affine(keys, Wk, bk); // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] kh = SplitHeads(kh, dimHeads); // [-4: batch size, -3: num heads, -2: max length, -1: split vector dim] - cache_[prefix + "_keys"] = kh; + if (cache) cache_result.first->second = kh; } Expr vh; - if (cache - && cache_.count(prefix + "_values") > 0 - && cache_[prefix + "_values"]->shape().elements() == values->shape().elements()) { - vh = cache_[prefix + "_values"]; + if (cache + && !((cache_result = cache_.insert(std::pair, Expr>({prefix + "_values", values->shape()}, vh))).second) + ) { + vh = cache_result.first->second; } else { int dimValues = values->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation auto Wv = graph_->param(prefix + "_Wv", {dimValues, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); @@ -316,7 +317,7 @@ class Transformer : public EncoderOrDecoderBase { vh = affine(values, Wv, bv); // [-4: batch size, -3: num heads, -2: max length, -1: split vector dim] vh = SplitHeads(vh, dimHeads); - cache_[prefix + "_values"] = vh; + if (cache) cache_result.first->second = vh; } int dimBeam = q->shape()[-4]; @@ -377,7 +378,7 @@ class Transformer : public EncoderOrDecoderBase { // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); - + auto opsPost = opt("transformer-postprocess"); output = postProcess(prefix + "_Wo", opsPost, output, input, dropProb); @@ -558,7 +559,7 @@ class EncoderTransformer : public Transformer { auto embeddingLayer = getEmbeddingLayer(opt("ulr", false)); std::tie(batchEmbeddings, batchMask) = embeddingLayer->apply((*batch)[batchIndex_]); batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch); - + // reorganize batch and timestep batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim] batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1] @@ -593,7 +594,7 @@ class EncoderTransformer : public Transformer { } // this allows to run a final layernorm operation after going through the transformer layer stack. - // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. 
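The `transformer-preprocess` / `transformer-postprocess` strings referred to in this comment are small per-layer recipes: each character names one step, applied left to right. Below is a hedged, framework-free illustration; the letter meanings ('d' dropout, 'a' residual add, 'n' layer normalization) are assumed from the prenorm example quoted above, and the plain-vector arithmetic merely stands in for the actual graph operations.

```cpp
#include <cmath>
#include <random>
#include <string>
#include <vector>

// Sketch only: apply the steps named by an ops string to a vector, given the
// residual (skip) input and a dropout probability.
std::vector<float> applyOps(const std::string& ops,
                            std::vector<float> x,
                            const std::vector<float>& residual,
                            float dropProb) {
  static std::mt19937 rng(42);
  std::bernoulli_distribution drop(dropProb);
  for(char op : ops) {
    if(op == 'd') {                                // dropout (unscaled, for illustration)
      for(auto& v : x)
        if(drop(rng)) v = 0.f;
    } else if(op == 'a') {                         // residual connection
      for(size_t i = 0; i < x.size(); ++i) x[i] += residual[i];
    } else if(op == 'n') {                         // layer norm (no learned scale/bias here)
      float mean = 0.f, var = 0.f;
      for(float v : x) mean += v;
      mean /= x.size();
      for(float v : x) var += (v - mean) * (v - mean);
      var /= x.size();
      for(auto& v : x) v = (v - mean) / std::sqrt(var + 1e-9f);
    }
  }
  return x;
}
```

With a postprocess string of `dan` this runs dropout, then the skip connection, then layer norm; a bare `n`, as recommended above for prenorm, only normalizes.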
Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); layer = postProcess(prefix_ + "_top", opsTop, layer, prevLayer, dropProb); @@ -622,14 +623,14 @@ class TransformerState : public DecoderState { int beamSize) const override { // @TODO: code duplication with DecoderState only because of isBatchMajor=true, should rather be a contructor argument of DecoderState? - + std::vector> newEncStates; - for(auto& es : encStates_) - // If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries + for(auto& es : encStates_) + // If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices)); // Create hypothesis-selected state based on current state and hyp indices - auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/true), logProbs_, newEncStates, batch_); + auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/true), logProbs_, newEncStates, batch_); // Set the same target token position as the current state // @TODO: This is the same as in base function. @@ -763,8 +764,8 @@ class DecoderTransformer : public Transformer { // This would happen if something goes wrong during batch pruning. ABORT_IF(encoderContext->shape()[-3] != dimBatch, - "Context and query batch dimension do not match {} != {}", - encoderContext->shape()[-3], + "Context and query batch dimension do not match {} != {}", + encoderContext->shape()[-3], dimBatch); // LayerAttention expects mask in a different layout @@ -871,7 +872,7 @@ class DecoderTransformer : public Transformer { } // This allows to run a final layernorm operation after going through the transformer layer stack. - // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); query = postProcess(prefix_ + "_top", opsTop, query, prevQuery, dropProb); @@ -884,7 +885,7 @@ class DecoderTransformer : public Transformer { if(shortlist_) output_->setShortlist(shortlist_); auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] - + // return unormalized(!) probabilities Ptr nextState; if (opt("transformer-decoder-autoreg", "self-attention") == "rnn") { @@ -909,9 +910,9 @@ class DecoderTransformer : public Transformer { output_->clear(); cache_.clear(); alignments_.clear(); - perLayerRnn_.clear(); // this needs to be cleared between batches. - // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, - // but where underlying memory has been deallocated by dropping all tensors + perLayerRnn_.clear(); // this needs to be cleared between batches. + // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, + // but where underlying memory has been deallocated by dropping all tensors // from a TensorAllocator object. 
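The recurring shape annotations in this file ([-4: beam depth, -3: batch size, -2: max length, -1: vector dim]) and checks such as `shape()[-3] != dimBatch` rely on negative axes counting from the innermost dimension. A toy illustration of that convention (not `marian::Shape` itself):

```cpp
#include <cassert>
#include <vector>

// Negative axes address dimensions from the end, so the same code works for
// tensors of different rank.
int dim(const std::vector<int>& shape, int axis) {
  int rank = static_cast<int>(shape.size());
  return shape[axis >= 0 ? axis : rank + axis];
}

int main() {
  std::vector<int> shape = {8, 64, 20, 512};  // {beam, batch, length, dim}
  assert(dim(shape, -1) == 512);              // vector dimension
  assert(dim(shape, -3) == 64);               // batch size, as checked by ABORT_IF above
  return 0;
}
```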
This can happen during ExpressionGraph::clear() } }; From 3b458b044e6b2695ba6ad0786320ea043d076772 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 24 Jan 2022 15:28:37 +0000 Subject: [PATCH 137/254] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 447243316..60c73ff44 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.27 +v1.10.28 From 71b5454b9eb441b2d802c2a6a3be6c0be3f6a30c Mon Sep 17 00:00:00 2001 From: Qianqian Zhu Date: Wed, 26 Jan 2022 15:17:38 +0000 Subject: [PATCH 138/254] Layer documentation (#892) * More examples for MLP layers and docs about RNN layers * Docs about embedding layer and more doxygen code docs * Add layer and factors docs into index.rst * Update layer documentation * Fix typos Co-authored-by: Roman Grundkiewicz Co-authored-by: Graeme Nail --- doc/factors.md | 2 +- doc/index.rst | 3 +- doc/layer.md | 241 +++++++++++++++++++++++++++++++ doc/operators.md | 2 +- src/layers/constructors.h | 101 ++++++++++--- src/layers/embedding.h | 37 ++++- src/layers/factory.h | 19 ++- src/layers/generic.h | 62 ++++++-- src/rnn/attention_constructors.h | 1 + src/rnn/cells.h | 1 + src/rnn/constructors.h | 17 +++ 11 files changed, 446 insertions(+), 40 deletions(-) create mode 100644 doc/layer.md diff --git a/doc/factors.md b/doc/factors.md index 59e14b682..dbd953b96 100644 --- a/doc/factors.md +++ b/doc/factors.md @@ -1,4 +1,4 @@ -# Using marian with factors +# Using Marian with factors Following this README should allow the user to train a model with source and/or target side factors. To train with factors, the data must be formatted in a certain way. A special vocabulary file format is also required, and its extension should be `.fsv` as providing a source and/or target vocabulary file with this extension is what triggers the usage of source and/or target factors. See details below. diff --git a/doc/index.rst b/doc/index.rst index d0a4fefb4..a790e6247 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -14,7 +14,8 @@ This is developer documentation. User documentation is available at https://mari graph operators - + layer + factors api/library_index contributing diff --git a/doc/layer.md b/doc/layer.md new file mode 100644 index 000000000..295a31536 --- /dev/null +++ b/doc/layer.md @@ -0,0 +1,241 @@ +# Layers + +In a typical deep neural network, highest-level blocks, which perform different kinds of +transformations on their inputs are called layers. A layer wraps a group of nodes and performs a +specific mathematical computation, offering a shortcut for building a more complex neural network. + +In Marian, for example, the `mlp::dense` layer represents a fully connected layer, which implements +the operation `output = activation(input * weight + bias)`. A dense layer in the graph can be +constructed with the following code: +```cpp +// add input node x +auto x = graph->constant({120,5}, inits::fromVector(inputData)); +// construct a dense layer in the graph +auto layer1 = mlp::dense() + ("prefix", "layer1") // prefix name is layer1 + ("dim", 5) // output dimension is 5 + ("activation", (int)mlp::act::tanh) // activation function is tanh + .construct(graph)->apply(x); // construct this layer in graph + // and link node x as the input +``` +The options are passed to the layer using pairs of `(key, value)`, where `key` is a predefined +option, and `value` is the option value. Then `construct()` is called to create a layer instance in +the graph, and `apply()` to link the input with this layer. 
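The snippet above only declares the layer; to run it, the graph has to be created first and a forward pass executed afterwards. A minimal sketch, assuming a CPU build and the usual graph setup described in the expression graph documentation (device, workspace size and the 600-element `inputData` vector are illustrative choices only):

```cpp
// Hypothetical setup and execution around the snippet above.
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(128);
std::vector<float> inputData(120 * 5, 1.f);

// ... define x and layer1 exactly as in the snippet above ...

graph->forward();   // run the forward pass
// layer1->shape() is now {120, 5}: one 5-dimensional output per input row.
```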
+ +Alternatively, the same layer can be created defining nodes and operations directly: +```cpp +// construct a dense layer using nodes +auto W1 = graph->param("W1", {120, 5}, inits::glorotUniform()); +auto b1 = graph->param("b1", {1, 5}, inits::zeros()); +auto h = tanh(affine(x, W1, b1)); +``` +There are four categories of layers implemented in Marian, described in the sections below. + +## Convolution layer + +To use a `convolution` layer, you first need to install [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). +The convolution layer supported by Marian is a 2D +[convolution layer](https://en.wikipedia.org/wiki/Convolutional_neural_network#Convolutional_layers). +This layer creates a convolution kernel which is used to convolved with the input. The options that +can be passed to a `convolution` layer are the following: + +| Option Name | Definition | Value Type | Default Value | +| ------------- |----------------|---------------|---------------| +| prefix | Prefix name (used to form the parameter names) | `std::string` | `None` | +| kernel-dims | The height and width of the kernel | `std::pair` | `None`| +| kernel-num | The number of kernel | `int` | `None` | +| paddings | The height and width of paddings | `std::pair` | `(0,0)`| +| strides | The height and width of strides | `std::pair` | `(1,1)` | + +Example: +```cpp +// construct a convolution layer +auto conv_1 = convolution(graph) // pass graph pointer to the layer + ("prefix", "conv_1") // prefix name is conv_1 + ("kernel-dims", std::make_pair(3,3)) // kernel is 3*3 + ("kernel-num", 32) // kernel no. is 32 + .apply(x); // link node x as the input +``` + +## MLP layers + +Marian offers `mlp::mlp`, which creates a +[multilayer perceptron (MLP)](https://en.wikipedia.org/wiki/Multilayer_perceptron) network. +It is a container which can stack multiple layers using `push_back()` function. There are two types +of MLP layers provided by Marian: `mlp::dense` and `mlp::output`. + +The `mlp::dense` layer, as introduced before, is a fully connected layer, and it accepts the +following options: + +| Option Name | Definition | Value Type | Default Value | +| ------------- |----------------|---------------|---------------| +| prefix | Prefix name (used to form the parameter names) | `std::string` | `None` | +| dim | Output dimension | `int` | `None` | +| layer-normalization | Whether to normalise the layer output or not | `bool` | `false` | +| nematus-normalization | Whether to use Nematus layer normalisation or not | `bool` | `false` | +| activation | Activation function | `int` | `mlp::act::linear` | + +The available activation functions for mlp are `mlp::act::linear`, `mlp::act::tanh`, +`mlp::act::sigmoid`, `mlp::act::ReLU`, `mlp::act::LeakyReLU`, `mlp::act::PReLU`, and +`mlp::act::swish`. + +Example: +```cpp +// construct a mlp::dense layer +auto dense_layer = mlp::dense() + ("prefix", "dense_layer") // prefix name is dense_layer + ("dim", 3) // output dimension is 3 + ("activation", (int)mlp::act::sigmoid) // activation function is sigmoid + .construct(graph)->apply(x); // construct this layer in graph and link node x as the input +``` + +The `mlp::output` layer is used, as the name suggests, to construct an output layer. You can tie +embedding layers to `mlp::output` layer using `tieTransposed()`, or set shortlisted words using +`setShortlist()`. 
The general options of `mlp::output` layer are listed below: + +| Option Name | Definition | Value Type | Default Value | +| ------------- |----------------|---------------|---------------| +| prefix | Prefix name (used to form the parameter names) | `std::string` | `None` | +| dim | Output dimension | `int` | `None` | +| vocab | File path to the factored vocabulary | `std::string` | `None` | +| output-omit-bias | Whether this layer has a bias parameter | `bool` | `true` | +| lemma-dim-emb | Re-embedding dimension of lemma in factors, must be used with `vocab` option | `int` | `0` | +| output-approx-knn | Parameters for LSH-based output approximation, i.e., `k` (the first element) and `nbit` (the second element) | `std::vector` | None | + +Example: +```cpp +// construct a mlp::output layer +auto last = mlp::output() + ("prefix", "last") // prefix name is dense_layer + ("dim", 5); // output dimension is 5 +``` +Finally, an example showing how to create a `mlp::mlp` network containing multiple layers: +```cpp +// construct a mlp::mlp network +auto mlp_networks = mlp::mlp() // construct a mpl container + .push_back(mlp::dense() // construct a dense layer + ("prefix", "dense") // prefix name is dense + ("dim", 5) // dimension is 5 + ("activation", (int)mlp::act::tanh))// activation function is tanh + .push_back(mlp::output() // construct a output layer + ("dim", 5)) // dimension is 5 + ("prefix", "mlp_network") // prefix name is mlp_network + .construct(graph); // construct this mlp layers in graph +``` + +## RNN layers +Marian offers `rnn::rnn` for creating a [recurrent neural network +(RNN)](https://en.wikipedia.org/wiki/Recurrent_neural_network) network. Just like `mlp::mlp`, +`rnn::rnn` is a container which can stack multiple layers using `push_back()` function. Unlike mlp +layers, Marian only provides cell-level APIs to construct RNN. RNN cells only process a single +timestep instead of the whole batches of input sequences. There are two types of rnn layers provided +by Marian: `rnn::cell` and `rnn::stacked_cell`. + +The `rnn::cell` is the base component of RNN and `rnn::stacked_cell` is a stack of `rnn::cell`. The +few options of `rnn::cell` layer are listed below: + +| Option Name | Definition | Value Type | Default Value | +| ------------- |----------------|---------------|---------------| +| type | Type of RNN cell | `std::string` | `None` | + +There are nine types of RNN cells provided by Marian: `gru`, `gru-nematus`, `lstm`, `mlstm`, `mgru`, +`tanh`, `relu`, `sru`, `ssru`. The general options for all RNN cells are the following: + +| Option Name | Definition | Value Type | Default Value | +| ------------- |----------------|---------------|---------------| +| dimInput | Input dimension | `int` | `None` | +| dimState | Dimension of hidden state | `int` | `None` | +| prefix | Prefix name (used to form the parameter names) | `std::string` | `None` | +| layer-normalization | Whether to normalise the layer output or not | `bool` | `false` | +| dropout | Dropout probability | `float` | `0` | +| transition | Whether it is a transition layer | `bool` | `false` | +| final | Whether it is an RNN final layer or hidden layer | `bool` | `false` | + +```{note} +Not all the options listed above are available for all the cells. For example, `final` option is +only used for `gru` and `gru-nematus` cells. 
+``` + +Example for `rnn::cell`: +```cpp +// construct a rnn cell +auto rnn_cell = rnn::cell() + ("type", "gru") // type of rnn cell is gru + ("prefix", "gru_cell") // prefix name is gru_cell + ("final", false); // this cell is the final layer +``` +Example for `rnn::stacked_cell`: +```cpp +// construct a stack of rnn cells +auto highCell = rnn::stacked_cell(); +// for loop to add rnn cells into the stack +for(size_t j = 1; j <= 512; j++) { + auto paramPrefix ="cell" + std::to_string(j); + highCell.push_back(rnn::cell()("prefix", paramPrefix)); +} +``` + +The list of available options for `rnn::rnn` layers: + +| Option Name | Definition | Value Type | Default Value | +| ------------- |----------------|---------------|---------------| +| type | Type of RNN layer | `std::string` | `gru` | +| direction | RNN direction | `int` | `rnn::dir::forward` | +| dimInput | Input dimension | `int` | `None` | +| dimState | Dimension of hidden state | `int` | `None` | +| prefix | Prefix name (used to form the parameter names) | `std::string` | `None` | +| layer-normalization | Whether to normalise the layer output or not | `bool` | `false` | +| nematus-normalization | Whether to use Nematus layer normalisation or not | `bool` | `false` | +| dropout | Dropout probability | `float` | `0` | +| skip | Whether to use skip connections | `bool` | `false` | +| skipFirst | Whether to use skip connections for the layer(s) with `index > 0` | `bool` | `false` | + +Examples for `rnn::rnn()`: +```cpp +// construct a `rnn::rnn()` container +auto rnn_container = rnn::rnn( + "type", "gru", // type of rnn cell is gru + "prefix", "rnn_layers", // prefix name is rnn_layers + "dimInput", 10, // input dimension is 10 + "dimState", 5, // dimension of hidden state is 5 + "dropout", 0, // dropout probability is 0 + "layer-normalization", false) // do not normalise the layer output + .push_back(rnn::cell()) // add a rnn::cell in this rnn container + .construct(graph); // construct this rnn container in graph +``` +Marian provides four RNN directions in `rnn::dir` enumerator: `rnn::dir::forward`, +`rnn::dir::backward`, `rnn::dir::alternating_forward` and `rnn::dir::alternating_backward`. +For rnn::rnn(), you can use `transduce()` to map the input state to the output state. + +An example for `transduce()`: +```cpp +auto output = rnn.construct(graph)->transduce(input); +``` + +## Embedding layer +Marian provides a shortcut to construct a regular embedding layer `embedding` for words embedding. 
+For `embedding` layers, there are following options available: + +| Option Name | Definition | Value Type | Default Value | +| ------------- |----------------|---------------|---------------| +| dimVocab | Size of vocabulary| `int` | `None` | +| dimEmb | Size of embedding vector | `int` | `None` | +| dropout | Dropout probability | `float` | `0` | +| inference | Whether it is used for inference | `bool` | `false` | +| prefix | Prefix name (used to form the parameter names) | `std::string` | `None` | +| fixed | whether this layer is fixed (not trainable) | `bool` | `false` | +| dimFactorEmb | Size of factored embedding vector | `int` | `None` | +| factorsCombine | Which strategy is chosen to combine the factor embeddings; it can be `"concat"` | `std::string` | `None` | +| vocab | File path to the factored vocabulary | `std::string` | `None` | +| embFile | Paths to the factored embedding vectors | `std::string>` | `None` | +| normalization | Whether to normalise the layer output or not | `bool` | `false` | + +Example to construct an embedding layer: +```cpp +// construct an embedding layer +auto embedding_layer = embedding() + ("prefix", "embedding") // prefix name is embedding + ("dimVocab", 1024) // vocabulary size is 1024 + ("dimEmb", 512) // size of embedding vector is 512 + .construct(graph); // construct this embedding layer in graph +``` diff --git a/doc/operators.md b/doc/operators.md index 2cca391b7..1e7bba96e 100644 --- a/doc/operators.md +++ b/doc/operators.md @@ -1,4 +1,4 @@ -# Operations in the Expression Graph +# Operations in the expression graph Operations are responsible for manipulating the elements of an expression graph. In Marian, many useful operations have already been implemented and can be found diff --git a/src/layers/constructors.h b/src/layers/constructors.h index 9e9de2077..5597a6a4e 100644 --- a/src/layers/constructors.h +++ b/src/layers/constructors.h @@ -12,6 +12,11 @@ namespace mlp { * Base class for layer factories, can be used in a multi-layer network factory. */ struct LayerFactory : public Factory { + /** + * Construct a layer instance in a given graph. + * @param graph a shared pointer a graph + * @return a shared pointer to the layer object + */ virtual Ptr construct(Ptr graph) = 0; }; @@ -31,18 +36,24 @@ class DenseFactory : public LayerFactory { } }; -// @TODO: change naming convention +/** + * A convenient typedef for constructing a MLP dense layer. + * @TODO: change naming convention + */ typedef Accumulator dense; /** - * Factory for output layers, can be used in a multi-layer network factory. + * Base factory for output layers, can be used in a multi-layer network factory. */ struct LogitLayerFactory : public Factory { using Factory::Factory; virtual Ptr construct(Ptr graph) = 0; }; -// @TODO: In the long run, I hope we can get rid of the abstract factories altogether. +/** + * Implementation of Output layer factory, can be used in a multi-layer network factory. + * @TODO: In the long run, I hope we can get rid of the abstract factories altogether. + */ class OutputFactory : public LogitLayerFactory { using LogitLayerFactory::LogitLayerFactory; @@ -74,12 +85,13 @@ class OutputFactory : public LogitLayerFactory { } }; -// @TODO: change naming convention -typedef Accumulator output; - /** - * Multi-layer network, holds and applies layers. + * A convenient typedef for constructing a MLP output layer. + * @TODO: change naming convention */ +typedef Accumulator output; + +/** Multi-layer network, holds and applies layers. 
*/ class MLP : public IUnaryLogitLayer, public IHasShortList { protected: Ptr graph_; @@ -88,8 +100,17 @@ class MLP : public IUnaryLogitLayer, public IHasShortList { std::vector> layers_; public: + /** + * Construct a MLP container in the graph. + * @param graph The expression graph. + * @param options The options used for this mlp container. + */ MLP(Ptr graph, Ptr options) : graph_(graph), options_(options) {} - + /** + * Apply/Link a vector of mlp layers (with the given inputs) to the expression graph. + * @param av The vector of input expressions + * @return The expression holding the mlp container + */ Expr apply(const std::vector& av) override { Expr output; if(av.size() == 1) @@ -102,7 +123,12 @@ class MLP : public IUnaryLogitLayer, public IHasShortList { return output; } - + /** + * Apply/Link a vector of mlp layers (with the given inputs) to the expression graph. + * @param av The vector of input expressions + * @return The expression holding the mlp container as a + * Logits object + */ Logits applyAsLogits(const std::vector& av) override { // same as apply() except for the last layer, we invoke applyAsLogits(), which has a different // return type @@ -126,13 +152,33 @@ class MLP : public IUnaryLogitLayer, public IHasShortList { return lastLayer->applyAsLogits(output); } } - + /** + * Apply/Link a mlp layer (with the given input) to the expression graph. + * @param e The input expression + * @return The expression holding the mlp container + */ Expr apply(Expr e) override { return apply(std::vector{e}); } + /** + * Apply/Link a mlp layer (with the given input) to the expression graph. + * @param e The input expression + * @return The expression holding the mlp container as a + * Logits object + */ Logits applyAsLogits(Expr e) override { return applyAsLogits(std::vector{e}); } - + /** + * Stack a mlp layer to the mlp container. + * @param layer The mlp layer + */ void push_back(Ptr layer) { layers_.push_back(layer); } + /** + * Stack a mlp layer with Logits object to the mlp container. + * @param layer The mlp layer with Logits object + */ void push_back(Ptr layer) { layers_.push_back(layer); } - + /** + * Set shortlisted words to the mlp container. + * @param shortlist The given shortlisted words + */ void setShortlist(Ptr shortlist) override final { auto p = tryAsHasShortlist(); ABORT_IF( @@ -140,7 +186,7 @@ class MLP : public IUnaryLogitLayer, public IHasShortList { "setShortlist() called on an MLP with an output layer that does not support short lists"); p->setShortlist(shortlist); } - + /** Remove shortlisted words from the mlp container. */ void clear() override final { auto p = tryAsHasShortlist(); if(p) @@ -154,8 +200,8 @@ class MLP : public IUnaryLogitLayer, public IHasShortList { }; /** - * Multi-layer network factory. Can hold layer factories. Used - * to accumulate options for later lazy construction. + * Multi-layer network factory. Can hold layer factories. + * Used to accumulate options for later lazy construction. */ class MLPFactory : public Factory { using Factory::Factory; @@ -164,6 +210,12 @@ class MLPFactory : public Factory { std::vector> layers_; public: + /** + * Create a MLP container instance in the expression graph. + * Used to accumulate options for later lazy construction. 
+ * @param graph The expression graph + * @return The shared pointer to the MLP container + */ Ptr construct(Ptr graph) { auto mlp = New(graph, options_); for(auto layer : layers_) { @@ -172,7 +224,11 @@ class MLPFactory : public Factory { } return mlp; } - + /** + * Stack a layer to the mlp container. + * @param lf The layer + * @return The Accumulator object holding the mlp container + */ template Accumulator push_back(const LF& lf) { layers_.push_back(New(lf)); @@ -201,6 +257,11 @@ class MLPFactory : public Factory { } public: + /** + * Stack a mlp output layer to the mlp container. + * @param lf The mlp output layer + * @return The Accumulator object holding the mlp container + */ Accumulator push_back(const Accumulator& lf) { push_back(AsLayerFactory(lf)); // layers_.push_back(New>(asLayerFactory((OutputFactory&)lf))); @@ -208,13 +269,19 @@ class MLPFactory : public Factory { } }; -// @TODO: change naming convention. + +/** + * A convenient typedef for constructing MLP layers. + * @TODO: change naming convention. + */ typedef Accumulator mlp; } // namespace mlp typedef ConstructingFactory EmbeddingFactory; typedef ConstructingFactory ULREmbeddingFactory; +/** A convenient typedef for constructing a standard embedding layers. */ typedef Accumulator embedding; +/** A convenient typedef for constructing ULR word embedding layers. */ typedef Accumulator ulr_embedding; } // namespace marian diff --git a/src/layers/embedding.h b/src/layers/embedding.h index d34c7ffb9..af22b980a 100644 --- a/src/layers/embedding.h +++ b/src/layers/embedding.h @@ -6,10 +6,12 @@ namespace marian { class FactoredVocab; -// A regular embedding layer. -// Note that this also applies dropout if the option is passed (pass 0 when in inference mode). -// It is best to not use Embedding directly, but rather via getEmbeddingLayer() in -// EncoderDecoderLayerBase, which knows to pass on all required parameters from options. +/** + * A regular embedding layer. + * Note that this also applies dropout if the option is passed (pass 0 when in inference mode). + * It is best to not use Embedding directly, but rather via getEmbeddingLayer() in + * EncoderDecoderLayerBase, which knows to pass on all required parameters from options. + */ class Embedding : public LayerBase, public IEmbeddingLayer { Expr E_; Expr FactorEmbMatrix_; // Factors embedding matrix if combining lemma and factors embeddings with concatenation @@ -19,16 +21,43 @@ class Embedding : public LayerBase, public IEmbeddingLayer { bool inference_{false}; public: + /** + * Construct a regular embedding layer in the graph. + * @param graph The expression graph. + * @param options The options used for this embedding layer. + */ Embedding(Ptr graph, Ptr options); + /** + * Apply/Link this embedding layer (with the given batch of sentences) to the expression graph. + * @param subBatch The batch of sentences + * @return The expression tuple holding the embedding layer and the masking layer + */ std::tuple apply( Ptr subBatch) const override final; + /** + * Apply/Link this embedding layer (with the given words and shape) to the expression graph. + * @param words Sequence of vocabulary items + * @param shape Shape of the words + * @return The expression holding the embedding layer + */ Expr apply(const Words& words, const Shape& shape) const override final; + /** + * Apply/Link this embedding layer (with the given WordIndex vector and shape) to the expression graph. 
+ * @param embIdx The vector of WordIndex objects + * @param shape Shape of the WordIndex vector + * @return The expression holding the embedding layer + */ Expr applyIndices(const std::vector& embIdx, const Shape& shape) const override final; }; +/** + * Universal Language Representation (ULR) word embedding layer. + * It is under development. + * @todo applyIndices() is not implemented + */ class ULREmbedding : public LayerBase, public IEmbeddingLayer { std::vector ulrEmbeddings_; // @TODO: These could now better be written as 6 named class members bool inference_{false}; diff --git a/src/layers/factory.h b/src/layers/factory.h index f9e4ddf92..df092199c 100644 --- a/src/layers/factory.h +++ b/src/layers/factory.h @@ -3,7 +3,10 @@ #include "marian.h" namespace marian { - +/** + * Base class for constructing models or layers. + * Its main attribute is options which hold the basic characteristics of the model or the layer. + */ class Factory : public std::enable_shared_from_this { protected: Ptr options_; @@ -68,8 +71,7 @@ class Factory : public std::enable_shared_from_this { template inline bool is() { return std::dynamic_pointer_cast(shared_from_this()) != nullptr; } }; - -// simplest form of Factory that just passes on options to the constructor of a layer type +/** Simplest form of Factory that just passes on options to the constructor of a layer. */ template struct ConstructingFactory : public Factory { using Factory::Factory; @@ -79,6 +81,17 @@ struct ConstructingFactory : public Factory { } }; +/** + * Accumulator pattern offers a shortcut to construct models or layers. + * The options can be passed by a pair of parentheses. E.g., to construct a fully-connected layer: + * \code{.cpp} + * auto hidden = mlp::dense() + ("prefix", "hidden_layer") // layer name + ("dim", outDim) // output dimension + ("activation", (int)mlp::act::sigmoid) // activation function + .construct(graph); // construct this layer in graph + \endcode + */ template // where BaseFactory : Factory class Accumulator : public BaseFactory { typedef BaseFactory Factory; diff --git a/src/layers/generic.h b/src/layers/generic.h index 9af033df5..b423befeb 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -9,18 +9,19 @@ namespace marian { namespace mlp { -/** - * @brief Activation functions - */ +/** Activation functions for MLP layers. */ enum struct act : int { linear, tanh, sigmoid, ReLU, LeakyReLU, PReLU, swish }; } // namespace mlp } // namespace marian namespace marian { -// Each layer consists of LayerBase and IXXXLayer which defines one or more apply() -// functions for the respective layer type (different layers may require different signatures). -// This base class contains configuration info for creating parameters and executing apply(). +/** + * Base class for a layer. + * Each layer consists of LayerBase and IXXXLayer which defines one or more apply() + * functions for the respective layer type (different layers may require different signatures). + * This base class contains configuration info for creating parameters and executing apply(). + */ class LayerBase { protected: Ptr graph_; @@ -40,22 +41,25 @@ class LayerBase { } }; -// Simplest layer interface: Unary function +/** Simplest layer interface: Unary function. */ struct IUnaryLayer { virtual ~IUnaryLayer() {} + /** Link a node as the input for this layer. */ virtual Expr apply(Expr) = 0; + /** Link a list of nodes as the inputs for this layer. 
*/ virtual Expr apply(const std::vector& es) { ABORT_IF(es.size() > 1, "Not implemented"); // simple stub return apply(es.front()); } }; +/** Shortlist interface for layers. */ struct IHasShortList { virtual void setShortlist(Ptr shortlist) = 0; virtual void clear() = 0; }; -// Embedding from corpus sub-batch to (emb, mask) +/** Embedding from corpus sub-batch to (emb, mask). */ struct IEmbeddingLayer { virtual std::tuple apply( Ptr subBatch) const = 0; @@ -67,8 +71,10 @@ struct IEmbeddingLayer { virtual ~IEmbeddingLayer() {} }; -// base class for Encoder and Decoder classes, which have embeddings and a batch index (=stream -// index) +/** + * Base class for Encoder and Decoder classes. + * Have embeddings and a batch index (=stream index). + */ class EncoderDecoderLayerBase : public LayerBase { protected: const std::string prefix_; @@ -98,16 +104,42 @@ class EncoderDecoderLayerBase : public LayerBase { Ptr createULREmbeddingLayer() const; public: - // get embedding layer; lazily create on first call + /** + * Get all embedding layer(s). + * It lazily creates the embedding layer on first call. + * This is lazy mostly because the constructors of the consuming objects are not + * guaranteed presently to have access to their graph. + * @param ulr whether to use ULREmbedding layer. false by default. + * @return a shared pointer to the embedding layer + */ Ptr getEmbeddingLayer(bool ulr = false) const; }; +/** + * The namespace mlp. + * Declare class Dense and all the available functions for creating + * multilayer perceptron (MLP) + * network. + */ namespace mlp { +/** + * Base class for a fully connected layer. + * Implement the operations `output = activation(input * weight + bias)`. + */ class Dense : public LayerBase, public IUnaryLayer { public: + /** + * Construct a dense layer in the graph. + * @param graph The expression graph. + * @param options The options used for this dense layer. + */ Dense(Ptr graph, Ptr options) : LayerBase(graph, options) {} - + /** + * Apply/Link a vector of dense layers (with the given inputs) to the expression graph. + * @param inputs The vector of the input expressions + * @return The expression holding the dense layers + */ Expr apply(const std::vector& inputs) override { ABORT_IF(inputs.empty(), "No inputs"); @@ -161,7 +193,11 @@ class Dense : public LayerBase, public IUnaryLayer { } // clang-format on }; - + /** + * Apply/Link this dense layer (with the given input) to the expression graph. + * @param input The input expression + * @return The expression holding the dense layer + */ Expr apply(Expr input) override { return apply(std::vector({input})); } }; diff --git a/src/rnn/attention_constructors.h b/src/rnn/attention_constructors.h index a878f57f6..4ad1975e7 100644 --- a/src/rnn/attention_constructors.h +++ b/src/rnn/attention_constructors.h @@ -33,6 +33,7 @@ class AttentionFactory : public InputFactory { } }; +/** A convenient typedef for constructing RNN attention layers. */ typedef Accumulator attention; } // namespace rnn } // namespace marian diff --git a/src/rnn/cells.h b/src/rnn/cells.h index cddfd26e6..18ac4d1dc 100644 --- a/src/rnn/cells.h +++ b/src/rnn/cells.h @@ -197,6 +197,7 @@ class ReLU : public Cell { Expr gruOps(const std::vector& nodes, bool final = false); +/** Base class for a gated recurrent unit (GRU) cell. 
*/ class GRU : public Cell { protected: std::string prefix_; diff --git a/src/rnn/constructors.h b/src/rnn/constructors.h index beb1fce11..22acfe9e7 100644 --- a/src/rnn/constructors.h +++ b/src/rnn/constructors.h @@ -5,6 +5,12 @@ #include "rnn/rnn.h" namespace marian { +/** + * The namespace rnn. + * Declare class Dense and all the available functions for creating + * recurrent neural network (RNN) + * network. + */ namespace rnn { typedef Factory StackableFactory; @@ -28,6 +34,12 @@ struct InputFactory : public StackableFactory { virtual Ptr construct(Ptr graph) = 0; }; +/** + * Base class for constructing RNN cells. + * RNN cells only process a single timestep instead of the whole batches of input sequences. + * There are nine types of RNN cells provided by Marian, i.e., `gru`, `gru-nematus`, `lstm`, + * `mlstm`, `mgru`, `tanh`, `relu`, `sru`, `ssru`. + */ class CellFactory : public StackableFactory { protected: std::vector)>> inputs_; @@ -92,8 +104,10 @@ class CellFactory : public StackableFactory { } }; +/** A convenience typedef for constructing RNN cells. */ typedef Accumulator cell; +/** Base class for constructing a stack of RNN cells (`rnn::cell`). */ class StackedCellFactory : public CellFactory { protected: std::vector> stackableFactories_; @@ -137,8 +151,10 @@ class StackedCellFactory : public CellFactory { } }; +/** A convenience typedef for constructing a stack of RNN cells. */ typedef Accumulator stacked_cell; +/** Base class for constructing RNN layers. */ class RNNFactory : public Factory { using Factory::Factory; protected: @@ -195,6 +211,7 @@ class RNNFactory : public Factory { } }; +/** A convenience typedef for constructing RNN containers/layers. */ typedef Accumulator rnn; } // namespace rnn } // namespace marian From 07c39c7d76587c439379a2ff4c208bad956cff87 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 28 Jan 2022 14:16:41 +0000 Subject: [PATCH 139/254] Cherry picked cleaning/refeactoring patches (#905) Cherry-picked updates from pull request #457 Co-authored-by: Mateusz Chudyk --- src/common/cli_wrapper.cpp | 13 ++++++-- src/common/cli_wrapper.h | 50 ++++++++++++++---------------- src/common/config.h | 2 +- src/common/config_parser.cpp | 26 ++++++++-------- src/common/definitions.h | 2 +- src/common/logging.cpp | 4 +-- src/common/logging.h | 12 +++---- src/common/options.h | 2 +- src/data/corpus_base.h | 6 ++-- src/data/dataset.h | 2 +- src/data/vocab.h | 2 +- src/graph/expression_graph.h | 34 ++++++++++---------- src/graph/expression_operators.cpp | 18 +++++------ src/layers/convolution.h | 4 +-- src/layers/loss.h | 4 +-- src/layers/word2vec_reader.h | 2 +- src/models/bert.h | 4 +-- src/models/encoder_classifier.h | 2 +- src/models/encoder_decoder.cpp | 2 +- src/models/transformer.h | 2 +- src/training/communicator.cpp | 4 +-- src/training/graph_group.h | 24 +++++++------- 22 files changed, 113 insertions(+), 108 deletions(-) diff --git a/src/common/cli_wrapper.cpp b/src/common/cli_wrapper.cpp index 211dd0b92..fee50a2cb 100644 --- a/src/common/cli_wrapper.cpp +++ b/src/common/cli_wrapper.cpp @@ -113,10 +113,10 @@ std::string CLIWrapper::switchGroup(std::string name) { return name; } -void CLIWrapper::parse(int argc, char **argv) { +void CLIWrapper::parse(int argc, char** argv) { try { app_->parse(argc, argv); - } catch(const CLI::ParseError &e) { + } catch(const CLI::ParseError& e) { exit(app_->exit(e)); } @@ -182,6 +182,13 @@ void CLIWrapper::parseAliases() { } } +std::string CLIWrapper::keyName(const std::string& args) const { + // re-use existing 
functions from CLI11 to keep option names consistent + return std::get<1>( + CLI::detail::get_names(CLI::detail::split_names(args))) // get long names only + .front(); // get first long name +} + void CLIWrapper::updateConfig(const YAML::Node &config, cli::OptionPriority priority, const std::string &errorMsg) { auto cmdOptions = getParsedOptionNames(); // Keep track of unrecognized options from the provided config @@ -276,7 +283,7 @@ std::vector CLIWrapper::getOrderedOptionNames() const { for(auto const &it : options_) keys.push_back(it.first); // sort option names by creation index - sort(keys.begin(), keys.end(), [this](const std::string &a, const std::string &b) { + sort(keys.begin(), keys.end(), [this](const std::string& a, const std::string& b) { return options_.at(a).idx < options_.at(b).idx; }); return keys; diff --git a/src/common/cli_wrapper.h b/src/common/cli_wrapper.h index 349d353ba..da8ebd6da 100644 --- a/src/common/cli_wrapper.h +++ b/src/common/cli_wrapper.h @@ -44,7 +44,7 @@ struct CLIAliasTuple { class CLIFormatter : public CLI::Formatter { public: CLIFormatter(size_t columnWidth, size_t screenWidth); - virtual std::string make_option_desc(const CLI::Option *) const override; + virtual std::string make_option_desc(const CLI::Option*) const override; private: size_t screenWidth_{0}; @@ -85,12 +85,7 @@ class CLIWrapper { // Extract option name from a comma-separated list of long and short options, e.g. 'help' from // '--help,-h' - std::string keyName(const std::string &args) const { - // re-use existing functions from CLI11 to keep option names consistent - return std::get<1>( - CLI::detail::get_names(CLI::detail::split_names(args))) // get long names only - .front(); // get first long name - } + std::string keyName(const std::string &args) const; // Get names of options passed via command-line std::unordered_set getParsedOptionNames() const; @@ -134,7 +129,7 @@ class CLIWrapper { * @return Option object */ template - CLI::Option *add(const std::string &args, const std::string &help, T val) { + CLI::Option* add(const std::string& args, const std::string& help, T val) { return addOption(keyName(args), args, help, @@ -159,7 +154,7 @@ class CLIWrapper { * @TODO: require to always state the default value creating the parser as this will be clearer */ template - CLI::Option *add(const std::string &args, const std::string &help) { + CLI::Option* add(const std::string& args, const std::string& help) { return addOption(keyName(args), args, help, @@ -206,7 +201,7 @@ class CLIWrapper { std::string switchGroup(std::string name = ""); // Parse command-line arguments. 
Handles --help and --version options - void parse(int argc, char **argv); + void parse(int argc, char** argv); /** * @brief Expand aliases based on arguments parsed with parse(int, char**) @@ -240,11 +235,12 @@ class CLIWrapper { std::string dumpConfig(bool skipUnmodified = false) const; private: - template ::value && !CLI::is_vector::value, - CLI::detail::enabler> = CLI::detail::dummy> - CLI::Option *addOption(const std::string &key, + template + using EnableIfNumbericOrString = CLI::enable_if_t::value + && !CLI::is_vector::value, CLI::detail::enabler>; + + template = CLI::detail::dummy> + CLI::Option* addOption(const std::string &key, const std::string &args, const std::string &help, T val, @@ -261,7 +257,7 @@ class CLIWrapper { CLI::callback_t fun = [this, key](CLI::results_t res) { options_[key].priority = cli::OptionPriority::CommandLine; // get variable associated with the option - auto &var = options_[key].var->as(); + auto& var = options_[key].var->as(); // store parser result in var auto ret = CLI::detail::lexical_cast(res[0], var); // update YAML entry @@ -288,10 +284,11 @@ class CLIWrapper { return options_[key].opt; } - template ::value, CLI::detail::enabler> = CLI::detail::dummy> - CLI::Option *addOption(const std::string &key, + template + using EnableIfVector = CLI::enable_if_t::value, CLI::detail::enabler>; + + template = CLI::detail::dummy> + CLI::Option* addOption(const std::string &key, const std::string &args, const std::string &help, T val, @@ -308,7 +305,7 @@ class CLIWrapper { CLI::callback_t fun = [this, key](CLI::results_t res) { options_[key].priority = cli::OptionPriority::CommandLine; // get vector variable associated with the option - auto &vec = options_[key].var->as(); + auto& vec = options_[key].var->as(); vec.clear(); bool ret = true; // handle '[]' as an empty vector @@ -316,7 +313,7 @@ class CLIWrapper { ret = true; } else { // populate the vector with parser results - for(const auto &a : res) { + for(const auto& a : res) { vec.emplace_back(); ret &= CLI::detail::lexical_cast(a, vec.back()); } @@ -345,10 +342,11 @@ class CLIWrapper { return options_[key].opt; } - template ::value, CLI::detail::enabler> = CLI::detail::dummy> - CLI::Option *addOption(const std::string &key, + template + using EnableIfBoolean = CLI::enable_if_t::value, CLI::detail::enabler>; + + template = CLI::detail::dummy> + CLI::Option* addOption(const std::string &key, const std::string &args, const std::string &help, T val, diff --git a/src/common/config.h b/src/common/config.h index 255c50add..c5a016e68 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -107,7 +107,7 @@ class Config { * @param mode change the set of available command-line options, e.g. training, translation, etc. * @param validate validate parsed options and abort on failure * - * @return parsed otions + * @return parsed options */ Ptr parseOptions(int argc, char** argv, diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 9705d5b7a..333d87a7a 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -119,10 +119,10 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { cli.add>("--config,-c", "Configuration file(s). 
If multiple, later overrides earlier"); cli.add("--workspace,-w", - "Preallocate arg MB of work space", + "Preallocate arg MB of work space", defaultWorkspace); cli.add("--log", - "Log training process information to file given by arg"); + "Log training process information to file given by arg"); cli.add("--log-level", "Set verbosity level of logging: trace, debug, info, warn, err(or), critical, off", "info"); @@ -392,17 +392,17 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Finish after this many chosen training units, 0 is infinity (e.g. 100e = 100 epochs, 10Gt = 10 billion target labels, 100Ku = 100,000 updates", "0e"); cli.add("--disp-freq", - "Display information every arg updates (append 't' for every arg target labels)", + "Display information every arg updates (append 't' for every arg target labels)", "1000u"); cli.add("--disp-first", - "Display information for the first arg updates"); + "Display information for the first arg updates"); cli.add("--disp-label-counts", "Display label counts when logging loss progress", true); // cli.add("--disp-label-index", // "Display label counts based on i-th input stream (-1 is last)", -1); cli.add("--save-freq", - "Save model file every arg updates (append 't' for every arg target labels)", + "Save model file every arg updates (append 't' for every arg target labels)", "10000u"); cli.add>("--logical-epoch", "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). " @@ -473,12 +473,12 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add("--lr-decay-repeat-warmup", "Repeat learning rate warmup when learning rate is decayed"); cli.add>("--lr-decay-inv-sqrt", - "Decrease learning rate at arg / sqrt(no. batches) starting at arg (append 't' or 'e' for sqrt(target labels or epochs)). " + "Decrease learning rate at arg / sqrt(no. batches) starting at arg (append 't' or 'e' for sqrt(target labels or epochs)). " "Add second argument to define the starting point (default: same as first value)", {"0"}); cli.add("--lr-warmup", - "Increase learning rate linearly for arg first batches (append 't' for arg first target labels)", + "Increase learning rate linearly for arg first batches (append 't' for arg first target labels)", "0"); cli.add("--lr-warmup-start-rate", "Start value for learning rate warmup"); @@ -492,7 +492,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add("--factor-weight", "Weight for loss function for factors (factored vocab only) (1 to disable)", 1.0f); cli.add("--clip-norm", - "Clip gradient norm to arg (0 to disable)", + "Clip gradient norm to arg (0 to disable)", 1.f); // @TODO: this is currently wrong with ce-sum and should rather be disabled or fixed by multiplying with labels cli.add("--exponential-smoothing", "Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable. 
" @@ -575,7 +575,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { cli.add>("--valid-sets", "Paths to validation corpora: source target"); cli.add("--valid-freq", - "Validate model every arg updates (append 't' for every arg target labels)", + "Validate model every arg updates (append 't' for every arg target labels)", "10000u"); cli.add>("--valid-metrics", "Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, " @@ -585,7 +585,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { cli.add("--valid-reset-stalled", "Reset all stalled validation metrics when the training is restarted"); cli.add("--early-stopping", - "Stop if the first validation metric does not improve for arg consecutive validation steps", + "Stop if the first validation metric does not improve for arg consecutive validation steps", 10); cli.add("--early-stopping-on", "Decide if early stopping should take into account first, all, or any validation metrics" @@ -637,7 +637,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { cli.add("--keep-best", "Keep best model for each validation metric"); cli.add("--valid-log", - "Log validation scores to file given by arg"); + "Log validation scores to file given by arg"); cli.switchGroup(previous_group); // clang-format on } @@ -942,10 +942,10 @@ void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) { cli.add("--ulr-query-vectors", "Path to file with universal sources embeddings from projection into universal space", ""); - // keys: EK in Fig2 : is the keys of the target embbedings projected to unified space (i.e. ENU in + // keys: EK in Fig2 : is the keys of the target embeddings projected to unified space (i.e. ENU in // multi-lingual case) cli.add("--ulr-keys-vectors", - "Path to file with universal sources embeddings of traget keys from projection into universal space", + "Path to file with universal sources embeddings of target keys from projection into universal space", ""); cli.add("--ulr-trainable-transformation", "Make Query Transformation Matrix A trainable"); diff --git a/src/common/definitions.h b/src/common/definitions.h index d2cf8aa41..159791d09 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -10,7 +10,7 @@ #include #include -#define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is neccessary, remove if no problems occur. +#define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is necessary, remove if no problems occur. #define NodeOp(op) [=]() { op; } // helper macro to disable optimization (gcc only) diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 6ecc4099a..f77a41df6 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -136,11 +136,11 @@ static void setErrorHandlers() { // modify the log pattern for the "general" logger to include the MPI rank // This is called upon initializing MPI. It is needed to associated error messages to ranks. 
-void switchtoMultinodeLogging(std::string nodeIdStr) { +void switchToMultinodeLogging(std::string nodeIdStr) { Logger log = spdlog::get("general"); if(log) log->set_pattern(fmt::format("[%Y-%m-%d %T mpi:{}] %v", nodeIdStr)); - + Logger valid = spdlog::get("valid"); if(valid) valid->set_pattern(fmt::format("[%Y-%m-%d %T mpi:{}] [valid] %v", nodeIdStr)); diff --git a/src/common/logging.h b/src/common/logging.h index 855bda90d..b350d6039 100644 --- a/src/common/logging.h +++ b/src/common/logging.h @@ -12,19 +12,19 @@ namespace marian { std::string getCallStack(size_t skipLevels); // Marian gives a basic exception guarantee. If you catch a - // MarianRuntimeError you must assume that the object can be + // MarianRuntimeError you must assume that the object can be // safely destructed, but cannot be used otherwise. - // Internal multi-threading in exception-throwing mode is not + // Internal multi-threading in exception-throwing mode is not // allowed; and constructing a thread-pool will cause an exception. - + class MarianRuntimeException : public std::runtime_error { private: std::string callStack_; public: - MarianRuntimeException(const std::string& message, const std::string& callStack) - : std::runtime_error(message), + MarianRuntimeException(const std::string& message, const std::string& callStack) + : std::runtime_error(message), callStack_(callStack) {} const char* getCallStack() const throw() { @@ -178,4 +178,4 @@ void checkedLog(std::string logger, std::string level, Args... args) { } void createLoggers(const marian::Config* options = nullptr); -void switchtoMultinodeLogging(std::string nodeIdStr); +void switchToMultinodeLogging(std::string nodeIdStr); diff --git a/src/common/options.h b/src/common/options.h index 08c6a3ca9..992be8760 100644 --- a/src/common/options.h +++ b/src/common/options.h @@ -98,7 +98,7 @@ class Options { * @brief Splice options from a YAML node * * By default, only options with keys that do not already exist in options_ are extracted from - * node. These options are cloned if overwirte is true. + * node. These options are cloned if overwrite is true. * * @param node a YAML node to transfer the options from * @param overwrite overwrite all options diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 63a6fb990..d504a7ea3 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -379,7 +379,7 @@ class CorpusBatch : public Batch { * @see marian::data::SubBatch::split(size_t n) */ std::vector> split(size_t n, size_t sizeLimit /*=SIZE_MAX*/) override { - ABORT_IF(size() == 0, "Encoutered batch size of 0"); + ABORT_IF(size() == 0, "Encountered batch size of 0"); std::vector>> subs; // [subBatchIndex][streamIndex] // split each stream separately @@ -523,8 +523,8 @@ class CorpusBase : public DatasetBase options, - bool translate = false, + CorpusBase(Ptr options, + bool translate = false, size_t seed = Config::seed); CorpusBase(const std::vector& paths, diff --git a/src/data/dataset.h b/src/data/dataset.h index 881126a34..3cdccec99 100644 --- a/src/data/dataset.h +++ b/src/data/dataset.h @@ -44,7 +44,7 @@ class DatasetBase { virtual void prepare() {} virtual void restore(Ptr) {} - // @TODO: remove after cleaning traininig/training.h + // @TODO: remove after cleaning training/training.h virtual Ptr options() { return options_; } }; diff --git a/src/data/vocab.h b/src/data/vocab.h index 4af82e8e8..7eeca2902 100644 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -10,7 +10,7 @@ namespace marian { class IVocab; // Wrapper around vocabulary types. 
Can choose underlying -// vocabulary implementation (vImpl_) based on speficied path +// vocabulary implementation (vImpl_) based on specified path // and suffix. // Vocabulary implementations can currently be: // * DefaultVocabulary for YAML (*.yml and *.yaml) and TXT (any other non-specific ending) diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index c532abffd..7e2a57040 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -76,7 +76,7 @@ class Tensors { Ptr getAllocator() { return tensors_->allocator(); } Ptr getTensorAllocator() { return tensors_; } - + Expr findOrRemember(Expr node) { size_t hash = node->hash(); // memoize constant nodes that are not parameters @@ -359,9 +359,9 @@ class ExpressionGraph : public std::enable_shared_from_this { // Find the named parameter and its typed parent parameter object (params) and return both. // If the parameter is not found return the parent parameter object that the parameter should be added to. - // Return [nullptr, nullptr] if no matching parent parameter object exists. - std::tuple> findParams(const std::string& name, - Type elementType, + // Return [nullptr, nullptr] if no matching parent parameter object exists. + std::tuple> findParams(const std::string& name, + Type elementType, bool typeSpecified) const { Expr p; Ptr params; if(typeSpecified) { // type has been specified, so we are only allowed to look for a parameter with that type @@ -373,12 +373,12 @@ class ExpressionGraph : public std::enable_shared_from_this { } else { // type has not been specified, so we take any type as long as the name matches for(auto kvParams : paramsByElementType_) { p = kvParams.second->get(name); - + if(p) { // p has been found, return with matching params object params = kvParams.second; break; } - + if(kvParams.first == elementType) // even if p has not been found, set the params object to be returned params = kvParams.second; } @@ -399,8 +399,8 @@ class ExpressionGraph : public std::enable_shared_from_this { Expr p; Ptr params; std::tie (p, params) = findParams(name, elementType, typeSpecified); - - if(!params) { + + if(!params) { params = New(elementType); params->init(backend_); paramsByElementType_.insert({elementType, params}); @@ -632,13 +632,13 @@ class ExpressionGraph : public std::enable_shared_from_this { * Return the Parameters object related to the graph. * The Parameters object holds the whole set of the parameter nodes. */ - Ptr& params() { + Ptr& params() { // There are no parameter objects, that's weird. ABORT_IF(paramsByElementType_.empty(), "No parameter object has been created"); - + // Safeguard against accessing parameters from the outside with multiple parameter types, not yet supported ABORT_IF(paramsByElementType_.size() > 1, "Calling of params() is currently not supported with multiple ({}) parameters", paramsByElementType_.size()); - + // Safeguard against accessing parameters from the outside with other than default parameter type, not yet supported auto it = paramsByElementType_.find(defaultElementType_); ABORT_IF(it == paramsByElementType_.end(), "Parameter object for type {} does not exist", defaultElementType_); @@ -650,7 +650,7 @@ class ExpressionGraph : public std::enable_shared_from_this { * Return the Parameters object related to the graph by elementType. * The Parameters object holds the whole set of the parameter nodes of the given type. 
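   *
   * A minimal usage sketch (illustrative only; call sites assumed, not taken from this header):
   *
   *   graph->setDefaultElementType(Type::float16); // set before any parameter exists; changing it later aborts
   *   auto& p16 = graph->params(Type::float16);    // parameter set holding all float16 parameter nodes
   *   auto& p   = graph->params();                 // default-typed set; aborts if several element types coexist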
*/ - Ptr& params(Type elementType) { + Ptr& params(Type elementType) { auto it = paramsByElementType_.find(elementType); ABORT_IF(it == paramsByElementType_.end(), "Parameter object for type {} does not exist", defaultElementType_); return it->second; @@ -661,8 +661,8 @@ class ExpressionGraph : public std::enable_shared_from_this { * The default value is used if some node type is not specified. */ void setDefaultElementType(Type defaultElementType) { - ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_, - "Parameter objects already exist, cannot change default type from {} to {}", + ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_, + "Parameter objects already exist, cannot change default type from {} to {}", defaultElementType_, defaultElementType); defaultElementType_ = defaultElementType; } @@ -746,7 +746,7 @@ class ExpressionGraph : public std::enable_shared_from_this { // skip over special parameters starting with "special:" if(pName.substr(0, 8) == "special:") continue; - + // if during loading the loaded type is of the same type class as the default element type, allow conversion; // otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both // have type class TypeClass::float_type. @@ -781,9 +781,9 @@ class ExpressionGraph : public std::enable_shared_from_this { LOG(info, "Memory mapping model at {}", ptr); auto items = io::mmapItems(ptr); - + // Deal with default parameter set object that might not be a mapped object. - // This gets assigned during ExpressionGraph::setDevice(...) and by default + // This gets assigned during ExpressionGraph::setDevice(...) and by default // would contain allocated tensors. Here we replace it with a mmapped version. auto it = paramsByElementType_.find(defaultElementType_); if(it != paramsByElementType_.end()) { diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 560ab4e73..322a29ad0 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -27,12 +27,12 @@ Expr checkpoint(Expr a) { return a; } -Expr lambda(const std::vector& nodes, Shape shape, Type type, +Expr lambda(const std::vector& nodes, Shape shape, Type type, LambdaNodeFunctor fwd, size_t hash) { return Expression(nodes, shape, type, fwd, hash); } -Expr lambda(const std::vector& nodes, Shape shape, Type type, +Expr lambda(const std::vector& nodes, Shape shape, Type type, LambdaNodeFunctor fwd, LambdaNodeFunctor bwd, size_t hash) { return Expression(nodes, shape, type, fwd, bwd, hash); } @@ -436,7 +436,7 @@ Expr std(Expr a, int ax) { return Expression(a - mean(a, ax), ax, ReduceNodeOpCode::rms); } -Expr var(Expr a, int ax) { +Expr var(Expr a, int ax) { if(a->shape()[ax] == 1) // nothing to reduce, var(a) = 0 return a - a; return Expression(a - mean(a, ax), ax, ReduceNodeOpCode::meanSqr); @@ -575,8 +575,8 @@ Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float sc return Expression(nodes, transA, transB, scale); } -// This operation used to implement auto-tuning. We have removed it for now due to complexity, but plan to revisit it in the future. -// The last branch with auto-tuner is: +// This operation used to implement auto-tuning. We have removed it for now due to complexity, but plan to revisit it in the future. 
+// The last branch with auto-tuner is: // youki/packed-model-pr-backup1031 // https://machinetranslation.visualstudio.com/Marian/_git/marian-dev?version=GByouki%2Fpacked-model-pr-backup1031 // SHA: 3456a7ed1d1608cfad74cd2c414e7e8fe141aa52 @@ -660,8 +660,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { } } else { // Default GEMM - ABORT_IF(!isFloat(aElementType) || !isFloat(bElementType), - "GPU-based GEMM only supports float types, you have A: {} and B: {}", + ABORT_IF(!isFloat(aElementType) || !isFloat(bElementType), + "GPU-based GEMM only supports float types, you have A: {} and B: {}", aElementType, bElementType); return affineDefault(a, b, bias, transA, transB, scale); } @@ -669,7 +669,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { Expr affineWithRelu(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { auto graph = a->graph(); - + if(graph->isInference() && graph->getDeviceId().type == DeviceType::gpu) return Expression(a, b, bias, transA, transB, scale); else @@ -775,7 +775,7 @@ Expr unlikelihood(Expr logits, Expr indices) { int dimBatch = logits->shape()[-2]; int dimTime = logits->shape()[-3]; - // @TODO: fix this outside of this function in decoder.h etc. + // @TODO: fix this outside of this function in decoder.h etc. auto indicesWithLayout = reshape(indices, {1, dimTime, dimBatch, 1}); // This is currently implemented with multiple ops, might be worth doing a special operation like for cross_entropy diff --git a/src/layers/convolution.h b/src/layers/convolution.h index c6024f639..463419e82 100644 --- a/src/layers/convolution.h +++ b/src/layers/convolution.h @@ -70,9 +70,9 @@ class CharConvPooling { outputs.push_back(output2); } - auto concated = concatenate(outputs, -1); + auto concatenated = concatenate(outputs, -1); - return concated; + return concatenated; } protected: diff --git a/src/layers/loss.h b/src/layers/loss.h index c662f9911..5dbb5e553 100644 --- a/src/layers/loss.h +++ b/src/layers/loss.h @@ -67,7 +67,7 @@ class RationalLoss { return count_->val()->scalar(); } - // @TODO: add a funtion for returning maybe ratio? + // @TODO: add a function for returning maybe ratio? size_t size() const { ABORT_IF(!count_, "Labels have not been defined"); @@ -189,7 +189,7 @@ class SumMultiRationalLoss : public MultiRationalLoss { * * L = sum_i^N L_i + N/M sum_j^M L_j * - * We set labels to N. When reporting L/N this is equvalient to sum of means. + * We set labels to N. When reporting L/N this is equivalent to sum of means. * Compare to sum of means below where N is factored into the loss, but labels * are set to 1. */ diff --git a/src/layers/word2vec_reader.h b/src/layers/word2vec_reader.h index 4bfc67091..c76d3a9b6 100644 --- a/src/layers/word2vec_reader.h +++ b/src/layers/word2vec_reader.h @@ -76,7 +76,7 @@ class Word2VecReader { float scale = sqrtf(2.0f / (dimVoc + dimEmb)); // @TODO: switch to new random generator back-end. - // This is rarly used however. + // This is rarely used however. std::random_device rd; std::mt19937 engine(rd()); diff --git a/src/models/bert.h b/src/models/bert.h index 514274572..99dfae55e 100644 --- a/src/models/bert.h +++ b/src/models/bert.h @@ -8,7 +8,7 @@ namespace marian { /** - * This file contains nearly all BERT-related code and adds BERT-funtionality + * This file contains nearly all BERT-related code and adds BERT-functionality * on top of existing classes like TansformerEncoder and Classifier. 
*/ @@ -82,7 +82,7 @@ class BertBatch : public CorpusBatch { // Initialize to sample random vocab id randomWord_.reset(new std::uniform_int_distribution(0, (WordIndex)vocab.size())); - // Intialize to sample random percentage + // Initialize to sample random percentage randomPercent_.reset(new std::uniform_real_distribution(0.f, 1.f)); auto& words = subBatch->data(); diff --git a/src/models/encoder_classifier.h b/src/models/encoder_classifier.h index 5c8ddb5a2..bb8d28564 100644 --- a/src/models/encoder_classifier.h +++ b/src/models/encoder_classifier.h @@ -14,7 +14,7 @@ namespace marian { * Can be used to train sequence classifiers like language detection, BERT-next-sentence-prediction etc. * Already has support for multi-objective training. * - * @TODO: this should probably be unified somehow with EncoderDecoder which could allow for deocder/classifier + * @TODO: this should probably be unified somehow with EncoderDecoder which could allow for decoder/classifier * multi-objective training. */ class EncoderClassifierBase : public models::IModel { diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index bb938ee55..5711ea1b8 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -220,7 +220,7 @@ Ptr EncoderDecoder::stepAll(Ptr graph, if(clearGraph) clear(graph); - // Required first step, also intializes shortlist + // Required first step, also initializes shortlist auto state = startState(graph, batch); // Fill state with embeddings from batch (ground truth) diff --git a/src/models/transformer.h b/src/models/transformer.h index af877600e..ec68b801a 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -70,7 +70,7 @@ class Transformer : public EncoderOrDecoderBase { // Hack for translating with length longer than trained embeddings // We check if the embedding matrix "Wpos" already exist so we can // check the number of positions in that loaded parameter. - // We then have to restict the maximum length to the maximum positon + // We then have to restrict the maximum length to the maximum positon // and positions beyond this will be the maximum position. Expr seenEmb = graph_->get("Wpos"); int numPos = seenEmb ? seenEmb->shape()[-2] : maxLength; diff --git a/src/training/communicator.cpp b/src/training/communicator.cpp index 55d4991bc..602f7daa7 100644 --- a/src/training/communicator.cpp +++ b/src/training/communicator.cpp @@ -101,7 +101,7 @@ class MPIWrapper : public IMPIWrapper std::string maxRankStr = std::to_string(MPIWrapper::numMPIProcesses() -1); while (rankStr.size() < maxRankStr.size()) // pad so that logs across MPI processes line up nicely rankStr.insert(rankStr.begin(), ' '); - switchtoMultinodeLogging(rankStr); + switchToMultinodeLogging(rankStr); } // log hostnames in order, and test @@ -261,7 +261,7 @@ void finalizeMPI(Ptr&& mpi) { ABORT_IF(mpi == nullptr || mpi != s_mpi, "attempted to finalize an inconsistent MPI instance. This should not be possible."); mpi = nullptr; // destruct caller's handle ABORT_IF(s_mpiUseCount == 0, "finalize called too many times. This should not be possible."); - if (s_mpiUseCount == 1) { // last call finalizes MPI, i.e. tells MPI that we sucessfully completed computation + if (s_mpiUseCount == 1) { // last call finalizes MPI, i.e. 
tells MPI that we successfully completed computation ABORT_IF(s_mpi.use_count() != 1, "dangling reference to MPI??"); // caller kept another shared_ptr to this instance s_mpi->finalize(); // signal successful completion to MPI s_mpi = nullptr; // release the singleton instance upon last finalization diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 422990b16..0e4a68dcc 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -13,7 +13,7 @@ namespace marian { // With -Ofast enabled gcc will fail to identify NaN or Inf. Safeguard here. static inline bool isFinite(float x) { -#ifdef __GNUC__ +#ifdef __GNUC__ ABORT_IF(std::isfinite(0.f / 0.f), "NaN detection unreliable. Disable -Ofast compiler option."); #endif return std::isfinite(x); @@ -27,7 +27,7 @@ static inline bool isFinite(float x) { // if one value is nonfinite propagate Nan into the reduction. static inline void accNanOrNorm(float& lhs, float rhs) { if(isFinite(lhs) && isFinite(rhs)) { - lhs = sqrtf(lhs * lhs + rhs * rhs); + lhs = sqrtf(lhs * lhs + rhs * rhs); } else lhs = std::numeric_limits::quiet_NaN(); } @@ -42,20 +42,20 @@ static inline void accNanOrNorm(float& lhs, float rhs) { class GraphGroup { protected: Ptr options_; - + Ptr comm_; // [not null] communicator, e.g. NCCLCommunicator Ptr mpi_; // [not null] all MPI-like communication goes through this (this is a dummy implementation if no MPI run) std::vector devices_; // [deviceIndex] - ShardingMode shardingMode_{ShardingMode::global}; // If local and multi-node training, shard only on local devices and do full sync (faster). If global shard across entire set of GPUs (more RAM). - + ShardingMode shardingMode_{ShardingMode::global}; // If local and multi-node training, shard only on local devices and do full sync (faster). If global shard across entire set of GPUs (more RAM). + // common for all graph groups, individual graph groups decide how to fill them std::vector> graphs_; // [deviceIndex] std::vector> models_; // [deviceIndex] std::vector> optimizerShards_; // [deviceIndex] Ptr scheduler_; // scheduler that keeps track of how much has been processed - + bool finalized_{false}; // 'true' if training has completed (further updates are no longer allowed) double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false @@ -100,16 +100,16 @@ class GraphGroup { virtual void load(); virtual void save(bool isFinal = false); - + private: void load(const OptimizerBase::ScatterStateFunc& scatterFn); void save(bool isFinal, const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn); - bool restoreFromCheckpoint(const std::string& modelFileName, + bool restoreFromCheckpoint(const std::string& modelFileName, const OptimizerBase::ScatterStateFunc& scatterFn); - void saveCheckpoint(const std::string& modelFileName, + void saveCheckpoint(const std::string& modelFileName, const OptimizerBase::GatherStateFunc& gatherFn); public: @@ -128,11 +128,11 @@ class GraphGroup { float executeAndCollectNorm(const std::function& task); float computeNormalizationFactor(float gNorm, size_t updateTrgWords); - + /** * Determine maximal batch size that can fit into the given workspace * so that reallocation does not happen. Rather adjust the batch size - * based on the stastistics collected here. Activated with + * based on the statistics collected here. 
Activated with * `--mini-batch-fit`. * In a multi-GPU scenario, the first GPU is used to determine the size. * The actual allowed size is then determined by multiplying it with the @@ -151,4 +151,4 @@ class GraphGroup { void updateAverageTrgBatchWords(size_t trgBatchWords); }; -} // namespace marian \ No newline at end of file +} // namespace marian From 266b931daa11bcd0f682d79a05542833e328849b Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Sun, 30 Jan 2022 20:11:38 +0000 Subject: [PATCH 140/254] Update list of contributors (#906) --- src/common/authors.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/common/authors.h b/src/common/authors.h index 5dcdaf283..6a714ebc0 100644 --- a/src/common/authors.h +++ b/src/common/authors.h @@ -35,31 +35,43 @@ std::string authors() { "An inevitably non-exhaustive list of contributors:\n" "\n" "Marcin Junczys-Dowmunt \n" - "Roman Grundkiewicz \n" + "Roman Grundkiewicz \n" "Frank Seide \n" "Hieu Hoang \n" - "Tomasz Dwojak \n" "Ulrich Germann \n" + "Tomasz Dwojak \n" "Alham Fikri Aji \n" - "Cédric Rousseau \n" "Young Jin Kim \n" - "Lane Schwartz \n" - "Andre Martins \n" "Nikolay Bogoychev \n" + "Andre Martins \n" + "Cédric Rousseau \n" "Kenneth Heafield \n" + "Lane Schwartz \n" "Maximiliana Behnke \n" + "Graeme Nail \n" + "Qianqian Zhu \n" + "Rohit Jain \n" "Tom Neckermann \n" "Hany Hassan Awadalla \n" "Jim Geovedi \n" + "Rihards Krišlauks \n" "Catarina Silva \n" "Jon Clark \n" - "Rihards Krišlauks \n" "Vishal Chowdhary \n" + "delong-coder \n" + "rhenry-nv \n" + "Kelly Davis \n" + "Aaron Burke \n" "Barry Haddow \n" + "David Meikle \n" "Dominik Stańczak \n" "Michael Hutt \n" "Richard Wei \n" + "Tommy MacWilliam \n" "Wenyong Huang \n" - "alancucki \n"; + "alancucki \n" + "alvations \n" + "huangjq0617 \n" + "Mateusz Chudyk \n"; } } // namespace marian From 8da539e835e8661d00697c8bd01164e64ab9ce62 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 6 Feb 2022 12:00:48 -0800 Subject: [PATCH 141/254] merged with master --- azure-pipelines.yml | 40 ++--- src/common/aliases.cpp | 4 +- src/common/config_parser.cpp | 28 +++- src/common/definitions.h | 10 +- src/common/utils.cpp | 8 +- src/data/batch_generator.h | 35 +++-- src/data/corpus.cpp | 152 ++++++++++-------- src/data/corpus.h | 3 + src/data/corpus_base.cpp | 77 ++++++++- src/data/corpus_base.h | 105 ++++++++++++- src/data/corpus_nbest.cpp | 7 +- src/data/corpus_sqlite.cpp | 6 +- src/data/sentencepiece_vocab.cpp | 8 +- src/data/text_input.cpp | 6 +- src/graph/expression_operators.cpp | 7 + src/graph/expression_operators.h | 15 +- src/graph/node_operators_binary.h | 61 ++++++- src/graph/node_operators_tuple.h | 2 +- src/layers/output.cpp | 22 ++- src/models/costs.cpp | 35 +++++ src/models/costs.h | 32 ++-- src/models/encoder_decoder.cpp | 2 + src/models/model_factory.cpp | 21 ++- src/models/transformer.h | 36 +++-- src/tensors/cpu/tensor_operators.cpp | 13 +- src/tensors/gpu/element.cu | 12 +- src/tensors/gpu/prod.cpp | 6 +- src/tensors/gpu/tensor_operators.cu | 210 +++++++++++++++++-------- src/tensors/tensor_operators.h | 42 ++++- src/training/graph_group.cpp | 135 ++++++++-------- src/training/graph_group.h | 18 +-- src/training/graph_group_async.cpp | 6 +- src/training/graph_group_singleton.cpp | 8 +- src/training/graph_group_sync.cpp | 8 +- src/translator/beam_search.cpp | 5 +- src/translator/nth_element.cpp | 2 + src/translator/translator.h | 2 +- 37 files changed, 849 insertions(+), 340 deletions(-) diff --git a/azure-pipelines.yml 
b/azure-pipelines.yml index 4f7ce02da..bc76f85c9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,6 +6,13 @@ # 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file # 4. "More actions" > "Save" +parameters: +# Allow skipping the entire 'Build' stage +- name: runBuilds + displayName: Run builds? Uncheck to run regression tests only. + type: boolean + default: true + # The pipeline CI trigger is set on the branch master only and PR trigger on a # (non-draft) pull request to any branch trigger: @@ -45,6 +52,7 @@ stages: ###################################################################### - job: BuildWindows + condition: eq(${{ parameters.runBuilds }}, true) displayName: Windows strategy: @@ -180,6 +188,7 @@ stages: ###################################################################### - job: BuildUbuntu + condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu timeoutInMinutes: 90 @@ -237,17 +246,7 @@ stages: examples: true static: true ################################################################ - # Ubuntu 16.04 supports CUDA 8+ - "16.04 CUDA 9.2 gcc-7": - image: ubuntu-16.04 - boost: true - cpu: true - gpu: true - cuda: 9.2 - gcc: 7 - unit_tests: true - examples: true - static: false + # Ubuntu 16.04 is no longer available on Azure-hosted machines pool: vmImage: $(image) @@ -322,18 +321,17 @@ stages: ###################################################################### - job: BuildUbuntuMinimal - displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5 + condition: eq(${{ parameters.runBuilds }}, true) + displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5 pool: - vmImage: ubuntu-16.04 + vmImage: ubuntu-18.04 steps: - checkout: self submodules: true # The script simplifies installation of different versions of CUDA. - # Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9. - # Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work... - bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0" displayName: Install CUDA @@ -346,10 +344,10 @@ stages: # GCC 5 is the minimum version supported - bash: | - /usr/bin/gcc-5 --version + /usr/bin/gcc-7 --version mkdir -p build cd build - CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \ + CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \ ../cmake-3.5.1-Linux-x86_64/bin/cmake .. 
\ -DCOMPILE_CPU=on \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 @@ -368,10 +366,11 @@ stages: ###################################################################### - job: BuildMacOS + condition: eq(${{ parameters.runBuilds }}, true) displayName: macOS CPU clang pool: - vmImage: macos-latest + vmImage: macos-10.15 steps: - checkout: self @@ -416,6 +415,7 @@ stages: ###################################################################### - job: BuildInstall + condition: eq(${{ parameters.runBuilds }}, true) displayName: Linux CPU library install pool: @@ -580,7 +580,7 @@ stages: # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' # instead of '/', which often breaks the job - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests @@ -677,7 +677,7 @@ stages: AWS_SECRET_SAS_TOKEN: $(blob-sas-token) workingDirectory: marian-prod-tests - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index 36613327e..b38ccc648 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -31,8 +31,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) { cli.alias("fp16", "true", [&](YAML::Node& config) { if(mode_ == cli::mode::training) { config["precision"] = std::vector({"float16", "float32"}); // inference type, optimization type, save type - // scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor - config["cost-scaling"] = std::vector({"0", "1000", "2", "0.05", "10", "1e-5"}); + // scaling factor, frequency, multiplier at increase, minium scaling factor + config["cost-scaling"] = std::vector({"256.f", "1000", "2.f", "256.f"}); } else { config["precision"] = std::vector({"float16"}); // for inference we do not need the other types } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 333d87a7a..ebbe4a89a 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -267,10 +267,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", "Size of position-wise feed-forward network (transformer)", - 2048); + 2048); + cli.add("--transformer-decoder-dim-ffn", + "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.", + 0); cli.add("--transformer-ffn-depth", "Depth of filters (transformer)", 2); + cli.add("--transformer-decoder-ffn-depth", + "Depth of filters in decoder (transformer). 
Uses --transformer-ffn-depth if 0", + 0); cli.add("--transformer-ffn-activation", "Activation between filters: swish or relu (transformer)", "swish"); @@ -528,15 +534,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { // mixed precision training cli.add("--fp16", "Shortcut for mixed precision training with float16 and cost-scaling, " - "corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f"); + "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f"); cli.add>("--precision", "Mixed precision training for forward/backward pass and optimizaton. " "Defines types for: forward/backward pass, optimization.", {"float32", "float32"}); cli.add>("--cost-scaling", "Dynamic cost scaling for mixed precision training: " - "power of 2, scaling window, scaling factor, tolerance, range, minimum factor") - ->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f"); + "scaling factor, frequency, multiplier, minimum factor") + ->implicit_val("256.f 1000 2.f 256.f"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", @@ -702,9 +708,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Use softmax shortlist: path first best prune"); cli.add>("--weights", "Scorer weights"); - cli.add("--output-sampling", - "Noise output layer with gumbel noise", - false); + cli.add>("--output-sampling", + "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. " + " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.") + ->implicit_val("full"); cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); @@ -889,6 +896,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { if(mode_ == cli::mode::training) { cli.add("--shuffle-in-ram", "Keep shuffled corpus in RAM, do not write to temp file"); + + cli.add("--data-threads", + "Number of concurrent threads to use during data reading and processing", 1); + // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope. cli.add("--all-caps-every", "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8"); @@ -907,6 +918,9 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { cli.add("--mini-batch-round-up", "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false", true); + } else { + cli.add("--data-threads", + "Number of concurrent threads to use during data reading and processing", 1); } // clang-format on } diff --git a/src/common/definitions.h b/src/common/definitions.h index 159791d09..e28ea5dcf 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -106,24 +106,24 @@ using Weak = std::weak_ptr; /** @brief Creates shared_ptr of any type, passes all arguments to any available * constructor */ template -Ptr New(Args&&... args) { - return Ptr(new T(std::forward(args)...)); +inline Ptr New(Args&&... args) { + return std::make_shared(std::forward(args)...); } template -Ptr New(Ptr p) { +inline Ptr New(Ptr p) { return Ptr(p); } /** @brief Creates InstrusivePtr of any type, passes all arguments to any available * constructor */ template -IPtr INew(Args&&... 
args) { +inline IPtr INew(Args&&... args) { return IPtr(new T(std::forward(args)...)); } template -IPtr INew(Ptr p) { +inline IPtr INew(Ptr p) { return IPtr(p); } diff --git a/src/common/utils.cpp b/src/common/utils.cpp index 72624041f..99fc790a2 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -70,22 +70,20 @@ void split(const std::string& line, // the function guarantees that the output has as many elements as requested void splitTsv(const std::string& line, std::vector& fields, size_t numFields) { fields.clear(); + fields.resize(numFields); // make sure there is as many elements as requested size_t begin = 0; size_t pos = 0; for(size_t i = 0; i < numFields; ++i) { pos = line.find('\t', begin); if(pos == std::string::npos) { - fields.push_back(line.substr(begin)); + fields[i] = line.substr(begin); break; } - fields.push_back(line.substr(begin, pos - begin)); + fields[i] = line.substr(begin, pos - begin); begin = pos + 1; } - if(fields.size() < numFields) // make sure there is as many elements as requested - fields.resize(numFields); - ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line); } diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h index a248db23a..ea9774682 100644 --- a/src/data/batch_generator.h +++ b/src/data/batch_generator.h @@ -2,6 +2,7 @@ #include "common/options.h" #include "common/signal_handling.h" +#include "common/timer.h" #include "data/batch_stats.h" #include "data/rng_engine.h" #include "training/training_state.h" @@ -92,6 +93,8 @@ class BatchGenerator : public RNGEngine { // this runs on a bg thread; sequencing is handled by caller, but locking is done in here std::deque fetchBatches() { + timer::Timer total; + typedef typename Sample::value_type Item; auto itemCmp = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; // sort by element length, not content @@ -135,19 +138,29 @@ class BatchGenerator : public RNGEngine { if(current_ != data_->end()) ++current_; } - size_t sets = 0; - while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data + + Samples maxiBatchTemp; + while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data if (saveAndExitRequested()) // stop generating batches return std::deque(); - maxiBatch->push(*current_); - sets = current_->size(); + + maxiBatchTemp.push_back(*current_); + // do not consume more than required for the maxi batch as this causes // that line-by-line translation is delayed by one sentence - bool last = maxiBatch->size() == maxSize; + bool last = maxiBatchTemp.size() == maxSize; if(!last) ++current_; // this actually reads the next line and pre-processes it } - size_t numSentencesRead = maxiBatch->size(); + size_t numSentencesRead = maxiBatchTemp.size(); + + size_t sets = 0; + for(auto&& s : maxiBatchTemp) { + if(!s.empty()) { + sets = s.size(); + maxiBatch->push(s); + } + } // construct the actual batches and place them in the queue Samples batchVector; @@ -163,6 +176,7 @@ class BatchGenerator : public RNGEngine { BatchStats::const_iterator cachedStatsIter; if (stats_) cachedStatsIter = stats_->begin(); + while(!maxiBatch->empty()) { // while there are sentences in the queue if (saveAndExitRequested()) // stop generating batches return std::deque(); @@ -178,12 +192,7 @@ class BatchGenerator : public RNGEngine { lengths[i] = batchVector.back()[i].size(); // record max lengths so far maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter); - // this optimization makes no 
difference indeed -#if 0 // sanity check: would we find the same entry if searching from the start? - auto it = stats_->lower_bound(lengths); - auto maxBatchSize1 = stats_->findBatchSize(lengths, it); - ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked"); -#endif + makeBatch = batchVector.size() >= maxBatchSize; // if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch if(batchVector.size() > maxBatchSize) { @@ -231,6 +240,8 @@ class BatchGenerator : public RNGEngine { LOG(debug, "[data] fetched {} batches with {} sentences. Per batch: {} sentences, {} labels.", tempBatches.size(), numSentencesRead, (double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom); + LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed()); + return tempBatches; } diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index d8a364b2e..643a7de93 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -14,18 +14,30 @@ namespace data { Corpus::Corpus(Ptr options, bool translate /*= false*/, size_t seed /*= Config:seed*/) : CorpusBase(options, translate, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} Corpus::Corpus(std::vector paths, std::vector> vocabs, Ptr options, size_t seed /*= Config:seed*/) : CorpusBase(paths, vocabs, options, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { bool isFactoredVocab = vocabs_.back()->tryAs() != nullptr; @@ -52,16 +64,10 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { } SentenceTuple Corpus::next() { - // Used for handling TSV inputs - // Determine the total number of fields including alignments or weights - auto tsvNumAllFields = tsvNumInputFields_; - if(alignFileIdx_ > -1) - ++tsvNumAllFields; - if(weightFileIdx_ > -1) - ++tsvNumAllFields; - std::vector fields(tsvNumAllFields); - - for(;;) { // (this is a retry loop for skipping invalid sentences) + size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size(); + std::vector fields(numStreams); + + while(true) { // retry loop // get index of the current sentence size_t curId = pos_; // note: at end, pos_ == total size // if corpus has been shuffled, ids_ contains sentence indexes @@ -69,83 +75,91 @@ SentenceTuple Corpus::next() { curId = ids_[pos_]; pos_++; - // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); size_t eofsHit = 0; - size_t numStreams = corpusInRAM_.empty() ? 
files_.size() : corpusInRAM_.size(); - for(size_t i = 0; i < numStreams; ++i) { - std::string line; - + for(size_t i = 0; i < numStreams; ++i) { // looping of all streams // fetch line, from cached copy in RAM or actual file if (!corpusInRAM_.empty()) { if (curId < corpusInRAM_[i].size()) - line = corpusInRAM_[i][curId]; + fields[i] = corpusInRAM_[i][curId]; else { eofsHit++; continue; } } else { - bool gotLine = io::getline(*files_[i], line).good(); + bool gotLine = io::getline(*files_[i], fields[i]).good(); if(!gotLine) { eofsHit++; continue; } } + } - if(i > 0 && i == alignFileIdx_) { - addAlignmentToSentenceTuple(line, tup); - } else if(i > 0 && i == weightFileIdx_) { - addWeightsToSentenceTuple(line, tup); - } else { - if(tsv_) { // split TSV input and add each field into the sentence tuple - utils::splitTsv(line, fields, tsvNumAllFields); - size_t shift = 0; - for(size_t j = 0; j < tsvNumAllFields; ++j) { - // index j needs to be shifted to get the proper vocab index if guided-alignment or - // data-weighting are preceding source or target sequences in TSV input - if(j == alignFileIdx_ || j == weightFileIdx_) { - ++shift; - } else { - size_t vocabId = j - shift; - bool altered; - preprocessLine(fields[j], vocabId, /*out=*/altered); - if (altered) - tup.markAltered(); - addWordsToSentenceTuple(fields[j], vocabId, tup); - } - } - - // weights are added last to the sentence tuple, because this runs a validation that needs - // length of the target sequence - if(alignFileIdx_ > -1) - addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); - if(weightFileIdx_ > -1) - addWeightsToSentenceTuple(fields[weightFileIdx_], tup); + if(eofsHit == numStreams) + return SentenceTuple(); // unintialized SentenceTuple which will be invalid when tested + ABORT_IF(eofsHit != 0, "not all input files have the same number of lines"); + + auto makeSentenceTuple = [this](size_t curId, std::vector fields) { + if(tsv_) { + // with tsv inputs data, there is only one input stream, hence we only have one field + // which needs to be tokenized into tab-separated fields + ABORT_IF(fields.size() != 1, "Reading TSV file, but we have don't have exactly one stream??"); + size_t numAllFields = tsvNumInputFields_; + if(alignFileIdx_ > -1) + ++numAllFields; + if(weightFileIdx_ > -1) + ++numAllFields; + // replace single-element fields array with extracted tsv fields + std::vector tmpFields; + utils::splitTsv(fields[0], tmpFields, numAllFields); // this verifies the number of fields + fields.swap(tmpFields); + } + + // fill up the sentence tuple with sentences from all input files + SentenceTupleImpl tup(curId); + size_t shift = 0; + for(size_t i = 0; i < fields.size(); ++i) { + // index j needs to be shifted to get the proper vocab index if guided-alignment or + // data-weighting are preceding source or target sequences in TSV input + if(i == alignFileIdx_ || i == weightFileIdx_) { + ++shift; } else { + size_t vocabId = i - shift; bool altered; - preprocessLine(line, i, /*out=*/altered); + preprocessLine(fields[i], vocabId, /*out=*/altered); if (altered) tup.markAltered(); - addWordsToSentenceTuple(line, i, tup); + addWordsToSentenceTuple(fields[i], vocabId, tup); } + + // weights are added last to the sentence tuple, because this runs a validation that needs + // length of the target sequence + if(alignFileIdx_ > -1) + addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); + if(weightFileIdx_ > -1) + addWeightsToSentenceTuple(fields[weightFileIdx_], tup); } - } - - if (eofsHit == numStreams) - return SentenceTuple(0); 
- ABORT_IF(eofsHit != 0, "not all input files have the same number of lines"); - // check if all streams are valid, that is, non-empty and no longer than maximum allowed length - if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { - return words.size() > 0 && words.size() <= maxLength_; - })) - return tup; + // check if all streams are valid, that is, non-empty and no longer than maximum allowed length + if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { + return words.size() > 0 && words.size() <= maxLength_; + })) { + return tup; + } else { + return SentenceTupleImpl(); // return an empty tuple if above test does not pass + } + }; + + if(threadPool_) { // use thread pool if available + return SentenceTuple(threadPool_->enqueue(makeSentenceTuple, curId, fields)); + } else { // otherwise launch here and just pass the result into the wrapper + auto tup = makeSentenceTuple(curId, fields); + if(!tup.empty()) + return SentenceTuple(tup); + } - // otherwise skip this sentence and try the next one - // @TODO: tail recursion? - } + } // end of retry loop } // reset and initialize shuffled reading @@ -167,6 +181,8 @@ void Corpus::reset() { pos_ = 0; for (size_t i = 0; i < paths_.size(); ++i) { if(paths_[i] == "stdin" || paths_[i] == "-") { + std::cin.tie(0); + std::ios_base::sync_with_stdio(false); files_[i].reset(new std::istream(std::cin.rdbuf())); // Probably not necessary, unless there are some buffers // that we want flushed. diff --git a/src/data/corpus.h b/src/data/corpus.h index e8e9a9fdb..281d43a22 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -4,6 +4,7 @@ #include #include +#include "3rd_party/threadpool.h" #include "common/definitions.h" #include "common/file_stream.h" #include "common/options.h" @@ -20,6 +21,8 @@ class Corpus : public CorpusBase { private: std::vector> tempFiles_; std::vector ids_; + + UPtr threadPool_; // thread pool for parallelized data reading // for shuffle-in-ram bool shuffleInRAM_{false}; diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 9d95a1214..20301103d 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -12,7 +12,24 @@ typedef std::vector MaskBatch; typedef std::pair WordMask; typedef std::vector SentBatch; -CorpusIterator::CorpusIterator() : pos_(-1), tup_(0) {} +void SentenceTupleImpl::setWeights(const std::vector& weights) { + if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine + ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); + auto numWeights = weights.size(); + auto numTrgWords = back().size(); + // word-level weights may or may not contain a weight for EOS tokens + if(numWeights != numTrgWords && numWeights != numTrgWords - 1) + LOG(warn, + "[warn] " + "Number of weights ({}) does not match the number of target words ({}) in line #{}", + numWeights, + numTrgWords, + id_); + } + weights_ = weights; +} + +CorpusIterator::CorpusIterator() : pos_(-1) {} CorpusIterator::CorpusIterator(CorpusBase* corpus) : corpus_(corpus), pos_(0), tup_(corpus_->next()) {} @@ -23,7 +40,7 @@ void CorpusIterator::increment() { } bool CorpusIterator::equal(CorpusIterator const& other) const { - return this->pos_ == other.pos_ || (this->tup_.empty() && other.tup_.empty()); + return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); } const SentenceTuple& CorpusIterator::dereference() const { @@ -390,7 +407,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) void 
CorpusBase::addWordsToSentenceTuple(const std::string& line, size_t batchIndex, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { // This turns a string in to a sequence of numerical word ids. Depending // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece // is used. @@ -411,7 +428,7 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, } void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used " "together at the moment"); @@ -420,7 +437,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, tup.setAlignment(align); } -void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const { +void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const { auto elements = utils::split(line, " "); if(!elements.empty()) { @@ -549,6 +566,7 @@ size_t CorpusBase::getNumberOfTSVInputFields(Ptr options) { return 0; } +<<<<<<< HEAD void SentenceTuple::setWeights(const std::vector& weights) { if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); @@ -564,6 +582,55 @@ void SentenceTuple::setWeights(const std::vector& weights) { id_); } weights_ = weights; +======= +// experimental: hide inline-fix source tokens from cross attention +std::vector SubBatch::crossMaskWithInlineFixSourceSuppressed() const +{ + const auto& srcVocab = *vocab(); + + auto factoredVocab = vocab()->tryAs(); + size_t inlineFixGroupIndex = 0, inlineFixSrc = 0; + auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc); + + auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG]; + auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG]; + auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG]; + auto unkId = srcVocab.getUnkId(); + auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId; + + auto m = mask(); // default return value, which we will modify in-place below in case we need to + if (hasInlineFixFactors || hasInlineFixTags) { + LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens"); + + // example: force French translation of name "frank" to always be "franck" + // - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to + // - hasInlineFixTags: " frank franck ", "frank" and all tags cannot be cross-attended to + auto dimBatch = batchSize(); // number of sentences in the batch + auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch + const auto& d = data(); + size_t numWords = 0; + for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries + bool inside = false; + for (size_t s = 0; s < dimWidth; s++) { // loop over source positions + auto i = locate(/*batchIdx=*/b, /*wordPos=*/s); + if (!m[i]) + break; + numWords++; + // keep track of entering/exiting the inline-fix source tags + auto w = d[i]; + if (w == fixSrcId) + inside = true; + else if (w == fixTgtId) + inside = false; + bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc; + if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor) + m[i] = 0.0f; // 
decoder must not look at embedded source, nor the markup tokens + } + } + ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??"); + } + return m; +>>>>>>> master } } // namespace data diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index d504a7ea3..a54c20f88 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -11,6 +11,8 @@ #include "data/rng_engine.h" #include "data/vocab.h" +#include + namespace marian { namespace data { @@ -22,7 +24,7 @@ namespace data { * construction of marian::data::CorpusBatch objects. They are not a part of * marian::data::CorpusBatch. */ -class SentenceTuple { +class SentenceTupleImpl { private: size_t id_; std::vector tuple_; // [stream index][step index] @@ -33,12 +35,17 @@ class SentenceTuple { public: typedef Words value_type; + /** + * @brief Creates an empty tuple with 0 id (default constructor). + */ + SentenceTupleImpl() : id_(0) {} + /** * @brief Creates an empty tuple with the given Id. */ - SentenceTuple(size_t id) : id_(id) {} + SentenceTupleImpl(size_t id) : id_(id) {} - ~SentenceTuple() { tuple_.clear(); } + ~SentenceTupleImpl() {} /** * @brief Returns the sentence's ID. @@ -114,6 +121,92 @@ class SentenceTuple { void setAlignment(const WordAlignment& alignment) { alignment_ = alignment; } }; +class SentenceTuple { +private: + std::shared_ptr> fImpl_; + mutable std::shared_ptr impl_; + +public: + typedef Words value_type; + + /** + * @brief Creates an empty tuple with no associated future. + */ + SentenceTuple() {} + + SentenceTuple(const SentenceTupleImpl& tupImpl) + : impl_(std::make_shared(tupImpl)) {} + + SentenceTuple(std::future&& fImpl) + : fImpl_(new std::future(std::move(fImpl))) {} + + SentenceTupleImpl& get() const { + if(!impl_) { + ABORT_IF(!fImpl_ || !fImpl_->valid(), "No future tuple associated with SentenceTuple"); + impl_ = std::make_shared(fImpl_->get()); + } + return *impl_; + } + + /** + * @brief Returns the sentence's ID. + */ + size_t getId() const { return get().getId(); } + + /** + * @brief Returns whether this Tuple was altered or augmented from what + * was provided to Marian in input. + */ + bool isAltered() const { return get().isAltered(); } + + /** + * @brief The size of the tuple, e.g. two for parallel data with a source and + * target sentences. + */ + size_t size() const { return get().size(); } + + /** + * @brief confirms that the tuple has been populated with data + */ + bool valid() const { + return fImpl_ || impl_; + } + + /** + * @brief The i-th tuple sentence. + * + * @param i Tuple's index. + */ + Words& operator[](size_t i) { return get()[i]; } + const Words& operator[](size_t i) const { return get()[i]; } + + /** + * @brief The last tuple sentence, i.e. the target sentence. + */ + Words& back() { return get().back(); } + const Words& back() const { return get().back(); } + + /** + * @brief Checks whether the tuple is empty. + */ + bool empty() const { return get().empty(); } + + auto begin() const -> decltype(get().begin()) { return get().begin(); } + auto end() const -> decltype(get().end()) { return get().end(); } + + auto rbegin() const -> decltype(get().rbegin()) { return get().rbegin(); } + auto rend() const -> decltype(get().rend()) { return get().rend(); } + + /** + * @brief Get sentence weights. + * + * For sentence-level weights the vector contains only one element. 
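   * A sentence-level weight is therefore stored as a single value, e.g. {0.5f}, while word-level
   * weights carry one value per target word; as validated in SentenceTupleImpl::setWeights(), the
   * entry for the trailing EOS token is optional.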
+ */ + const std::vector& getWeights() const { return get().getWeights(); } + + const WordAlignment& getAlignment() const { return get().getAlignment(); } +}; + /** * @brief Batch of sentences represented as word indices with masking. */ @@ -583,17 +676,17 @@ class CorpusBase : public DatasetBase batch, const std::vector& batchVector); diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp index d5a48d8df..8029d3516 100644 --- a/src/data/corpus_nbest.cpp +++ b/src/data/corpus_nbest.cpp @@ -43,7 +43,7 @@ SentenceTuple CorpusNBest::next() { pos_++; // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); std::string line; lastLines_.resize(files_.size() - 1); @@ -74,9 +74,10 @@ SentenceTuple CorpusNBest::next() { if(cont && std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + + return SentenceTuple(); } void CorpusNBest::reset() { diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp index 297847c04..f7c577f29 100644 --- a/src/data/corpus_sqlite.cpp +++ b/src/data/corpus_sqlite.cpp @@ -109,7 +109,7 @@ SentenceTuple CorpusSQLite::next() { while(select_->executeStep()) { // fill up the sentence tuple with sentences from all input files size_t curId = select_->getColumn(0).getInt(); - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { auto line = select_->getColumn((int)(i + 1)); @@ -126,9 +126,9 @@ SentenceTuple CorpusSQLite::next() { if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + return SentenceTuple(); } void CorpusSQLite::shuffle() { diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 090d478b2..8f774c2bb 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -236,18 +236,20 @@ class SentencePieceVocab : public IVocab { return words; } - std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override { + std::string decode(const Words& sentence, bool ignoreEOS) const override { std::string line; if(keepEncoded_) { // i.e. 
keep the sentence segmented into subword units for(const Word& id : sentence) - line += (*this)[id] + " "; + if(!ignoreEOS || id != getEosId()) + line += (*this)[id] + " "; line.pop_back(); // trim the trailing whitespace } else { // convert vector of Word to vector of int std::vector spmSentence; spmSentence.reserve(sentence.size()); for(auto&& word : sentence) - spmSentence.push_back(word.toWordIndex()); + if(!ignoreEOS || word != getEosId()) + spmSentence.push_back(word.toWordIndex()); spm_->Decode(spmSentence, &line); } return line; diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 958190fce..b1f4cdd47 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -40,7 +40,7 @@ SentenceTuple TextInput::next() { size_t curId = pos_++; // fill up the sentence tuple with source and/or target sentences - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { std::string line; if(io::getline(*files_[i], line)) { @@ -57,9 +57,9 @@ SentenceTuple TextInput::next() { } if(tup.size() == files_.size()) // check if each input file provided an example - return tup; + return SentenceTuple(tup); else if(tup.size() == 0) // if no file provided examples we are done - return SentenceTuple(0); + return SentenceTuple(); else // neither all nor none => we have at least on missing entry ABORT("There are missing entries in the text tuples."); } diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 322a29ad0..5294fca3f 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -357,6 +357,13 @@ Expr gather(Expr a, int axis, Expr indices) { return Expression(a, axis, indices); } +// scatter() -- scatter arbitrary elements along an axis; batched or non-batched +// This is the reverse operation to gather. +Expr scatter(Expr a, int axis, Expr indices, Expr source) { + return Expression(a, axis, indices, source); +} + + // index_select() -- gather arbitrary elements along an axis from an unbatched // input 'a'. Indices are specified as a 1D vector. // This is used e.g. for embedding lookup. diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index dc756c7d6..1e98047f9 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -687,10 +687,23 @@ Expr stopGradient(Expr a); * @param indices The indices to be gathered * @returns Gathered expression with the same shape as @p indices * @note @p a and @p indices must have the same rank - * @note The non-target axes of @p a and @p indicies must have the same size, or be broadcastable. + * @note The non-target axes of @p a and @p indices must have the same size, or be broadcastable. */ Expr gather(Expr a, int axis, Expr indices); +/** + * Scatter elements from source along an axis into a. Unindexed elements from a remain unchanged. + * This is the reverse operation to gather. + * @param a The input expression + * @param axis The axis along which to index + * @param indices The indices to be scattered + * @param source Expression with values to scatter. + * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices + * @note @p source and @p indices must have the same rank + * @note In this version @p source and @p indicies must have the same shape + */ +Expr scatter(Expr a, int axis, Expr indices, Expr source); + #if 0 // reverse operation to gather. 
a is expression into with values from b are inserted and positions indices along axis. // with broadcasting diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index a180bb5c8..b2a646b1c 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1033,12 +1033,14 @@ struct GatherNodeOp : public NaryNodeOp { NodeOps forwardOps() override { return {NodeOp( + // @TODO: rename to gather Select(val_, child(0)->val(), child(1)->val(), axis_))}; } NodeOps backwardOps() override { return {NodeOp( - Insert(child(0)->grad(), adj_, child(1)->val(), axis_))}; + // @TODO: rename to scatter + Insert(child(0)->grad(), adj_, child(1)->val(), axis_))}; } Shape newShape(Expr a, int axis, Expr indices) { @@ -1046,7 +1048,6 @@ struct GatherNodeOp : public NaryNodeOp { axis = shape.axis(axis); auto rank = shape.size(); ABORT_IF(rank != indices->shape().size(), "Mismatching ranks for input ({}) and indices ({})", std::string(shape), std::string(indices->shape())); - axis = a->shape().axis(axis); shape.set(axis, indices->shape()[axis]); for (size_t i = 0; i < rank; ++i) { if (i != axis) { @@ -1086,6 +1087,62 @@ struct GatherNodeOp : public NaryNodeOp { int axis_; }; +struct ScatterNodeOp : public NaryNodeOp { + ScatterNodeOp(Expr a, int axis, Expr indices, Expr source) + : NaryNodeOp({a, indices, source}, newShape(a, axis, indices, source), a->value_type()), + axis_(a->shape().axis(axis)) { + matchOrAbort(indices->value_type()); + } + + NodeOps forwardOps() override { + return {NodeOp( + CopyCast(val_, child(0)->val()); // @TODO: use normal copy + Insert(val_, child(2)->val(), child(1)->val(), axis_) + )}; + } + + NodeOps backwardOps() override { + ABORT("backward for ScatterNodeOp not yet implemented"); + } + + Shape newShape(Expr a, int axis, Expr indices, Expr source) { + ABORT_IF(axis != -1, "only last dimensions"); + ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); + + Shape shape = a->shape(); + // @TODO: do proper checking + return shape; + } + + const std::string type() override { return "scatter"; } + + const std::string color() override { return "orange"; } + + virtual size_t hash() override { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, axis_); + hash_ = seed; + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + return true; + } + +private: + friend class SerializationHelpers; + int axis_; +}; + struct ColsNodeOp : public NaryNodeOp { ColsNodeOp(Expr a, Expr indices) : NaryNodeOp({a, indices}, newShape(a, indices), a->value_type()) { diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h index c7a9531a1..8acb1bc83 100644 --- a/src/graph/node_operators_tuple.h +++ b/src/graph/node_operators_tuple.h @@ -133,7 +133,7 @@ struct TopKNodeOp : public UnaryNodeOp, } void backward() override { - Insert(/*out*/child(0)->grad(), adj_, val_, axis_); + Insert(/*out*/child(0)->grad(), adj_, val_, axis_); } const std::string type() override { return "topk"; } diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 4d6e488a4..efff58df4 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -309,14 +309,24 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } return Logits(std::move(allLogits), factoredVocab_); } else if(shortlist_) { - return 
Logits(affineOrDot(input, - shortlist_->getCachedShortWt(), - shortlist_->getCachedShortb(), + const Shape &inputShape = input->shape(); + assert(inputShape[1] == 1); // time dimension always 1 for decoding + input = reshape(input, {inputShape[0], inputShape[2], 1, inputShape[3]}); + + Expr Wt = shortlist_->getCachedShortWt(); + Expr b = shortlist_->getCachedShortb(); + Expr ret = affineShortlist(input, + Wt, + b, false, - /*transB=*/isLegacyUntransposedW ? false : true)); + /*transB=*/isLegacyUntransposedW ? false : true); + const Shape &retShape = ret->shape(); + assert(retShape[2] == 1); // time dimension always 1 for decoding + ret = reshape(ret, {retShape[0], 1, retShape[1], retShape[3]}); + return Logits(ret); } else { - return Logits( - affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true)); + Expr ret = affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true); + return Logits(ret); } } diff --git a/src/models/costs.cpp b/src/models/costs.cpp index c688b2119..4b15bcb36 100644 --- a/src/models/costs.cpp +++ b/src/models/costs.cpp @@ -10,5 +10,40 @@ Ptr LogSoftmaxStep::apply(Ptr state) { return state; } +Ptr GumbelSoftmaxStep::apply(Ptr state) { + state->setLogProbs(state->getLogProbs().applyUnaryFunctions( + [](Expr logits) { // lemma gets gumbelled + return logsoftmax(logits + constant_like(logits, inits::gumbel())); + }, + logsoftmax)); // factors don't + return state; +} + +TopkGumbelSoftmaxStep::TopkGumbelSoftmaxStep(int k) : k_{k} {} + +Ptr TopkGumbelSoftmaxStep::apply(Ptr state) { + state->setLogProbs(state->getLogProbs().applyUnaryFunctions( + [=](Expr logits) { // lemma gets gumbelled + // create logits-sized tensor consisting only of invalid path scores + float invalidPathScore = NumericLimits(logits->value_type()).lowest; + Expr invalidLogits = constant_like(logits, inits::fromValue(invalidPathScore)); + + // select top-k values + Expr val, idx; + std::tie(val, idx) = topk(logits, k_, /*axis=*/-1, /*descending=*/true); + + // uncomment below to display probability mass in top-k selection + // debug(sum(gather(softmax(logits), -1, idx), -1), "sum"); + + // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search + Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel())); + + // Scatter gumbelled values back into logits to fill with usable values + return scatter(invalidLogits, -1, idx, gumbelVal); + }, + logsoftmax)); // factors don't + return state; +} + } // namespace models } // namespace marian diff --git a/src/models/costs.h b/src/models/costs.h index 982a13c57..9bb2b1039 100644 --- a/src/models/costs.h +++ b/src/models/costs.h @@ -297,20 +297,30 @@ class LogSoftmaxStep : public ILogProbStep { virtual Ptr apply(Ptr state) override; }; -// Gumbel-max noising for sampling during beam-search -// Seems to work well enough with beam-size=1. Turn on -// with --output-sampling during translation with marian-decoder +// Gumbel-max noising for sampling during translation. +// Produces accurate sampling with beam=1. Turn on +// with --output-sampling [full] during translation +// with marian-decoder for samnpling from the full +// softmax distribution. 
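For readers unfamiliar with the Gumbel-max trick behind GumbelSoftmaxStep and TopkGumbelSoftmaxStep, the sketch below shows the same idea on plain std::vector logits rather than Marian expressions (none of the names here are Marian API): adding Gumbel(0,1) noise to logits and taking the argmax draws a sample from the softmax distribution, and noising only the top-k logits while leaving the rest at an "invalid path score" restricts sampling to the k most likely items, which is what the topk/scatter combination in TopkGumbelSoftmaxStep::apply achieves.

```cpp
// Standalone illustration of top-k Gumbel-max sampling on plain logits.
// None of this is Marian API; it only mirrors the logic of
// TopkGumbelSoftmaxStep::apply above.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <numeric>
#include <random>
#include <vector>

int sampleTopK(const std::vector<float>& logits, int k, std::mt19937& rng) {
  // indices of the k largest logits, like topk(logits, k, /*axis=*/-1, true)
  std::vector<int> idx(logits.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int a, int b) { return logits[a] > logits[b]; });

  // start from "invalid path scores" everywhere, as with the constant_like(...) above
  std::vector<float> noised(logits.size(), std::numeric_limits<float>::lowest());

  // add Gumbel(0,1) noise to the top-k entries only and scatter them back
  std::uniform_real_distribution<float> uniform(1e-8f, 1.f);
  for(int i = 0; i < k; ++i) {
    float gumbel = -std::log(-std::log(uniform(rng)));
    noised[idx[i]] = logits[idx[i]] + gumbel;
  }

  // argmax over the noised scores samples from the softmax distribution
  // restricted (and renormalized) to the k most likely items
  return (int)std::distance(noised.begin(),
                            std::max_element(noised.begin(), noised.end()));
}

int main() {
  std::mt19937 rng(1234);
  std::vector<float> logits = {2.0f, 1.0f, 0.5f, -1.0f, -3.0f};
  std::vector<int> counts(logits.size(), 0);
  for(int n = 0; n < 10000; ++n)
    counts[sampleTopK(logits, /*k=*/3, rng)]++;
  for(size_t i = 0; i < counts.size(); ++i)
    std::cout << "item " << i << ": " << counts[i] << "\n"; // items 3 and 4 stay at 0
  return 0;
}
```

Skipping the intermediate logsoftmax over the selected values does not change the argmax, so this simplified sketch draws from the same restricted distribution as the graph version.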
class GumbelSoftmaxStep : public ILogProbStep { public: virtual ~GumbelSoftmaxStep() {} - virtual Ptr apply(Ptr state) override { - state->setLogProbs(state->getLogProbs().applyUnaryFunctions( - [](Expr logits) { // lemma gets gumbelled - return logsoftmax(logits + constant_like(logits, inits::gumbel())); - }, - logsoftmax)); // factors don't - return state; - } + virtual Ptr apply(Ptr state) override; +}; + + +// Gumbel-max noising for top-k sampling during translation. +// Produces accurate sampling with beam=1. Turn on +// with --output-sampling topk [10] during translation +// with marian-decoder for top-10 sampling. +class TopkGumbelSoftmaxStep : public ILogProbStep { +private: + int k_{1}; + +public: + TopkGumbelSoftmaxStep(int k); + virtual ~TopkGumbelSoftmaxStep() {} + virtual Ptr apply(Ptr state) override; }; // class to wrap an IEncoderDecoder and a ILogProbStep that are executed in sequence, diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 5711ea1b8..a6f4dd3dc 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -38,7 +38,9 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-heads"); modelFeatures_.insert("transformer-no-projection"); modelFeatures_.insert("transformer-dim-ffn"); + modelFeatures_.insert("transformer-decoder-dim-ffn"); modelFeatures_.insert("transformer-ffn-depth"); + modelFeatures_.insert("transformer-decoder-ffn-depth"); modelFeatures_.insert("transformer-ffn-activation"); modelFeatures_.insert("transformer-dim-aan"); modelFeatures_.insert("transformer-aan-depth"); diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index e176e6a4c..52a87e72a 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -370,10 +370,25 @@ Ptr createModelFromOptions(Ptr options, usage use) { // add (log)softmax if requested if (use == usage::translation) { if(std::dynamic_pointer_cast(baseModel)) { - if(options->get("output-sampling", false)) - return New(std::dynamic_pointer_cast(baseModel), New()); - else + if(options->hasAndNotEmpty("output-sampling")) { + auto sampling = options->get>("output-sampling", {}); + std::string method = sampling.size() > 0 ? sampling[0] : "full"; + + if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) { + LOG(info, "Output sampling from the full softmax distribution"); + return New(std::dynamic_pointer_cast(baseModel), New()); + } else if(method == "topk") { + int k = sampling.size() > 1 ? 
std::stoi(sampling[1]) : 10; + if(k == 1) + LOG(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); + LOG(info, "Output sampling via top-{} sampling", k); + return New(std::dynamic_pointer_cast(baseModel), New(k)); + } else { + ABORT("Unknown sampling method: {}", method); + } + } else { return New(std::dynamic_pointer_cast(baseModel), New()); + } } #ifdef COMPILE_EXAMPLES // note: 'usage::translation' here means 'inference' diff --git a/src/models/transformer.h b/src/models/transformer.h index ec68b801a..95a55d3aa 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -148,8 +148,7 @@ class Transformer : public EncoderOrDecoderBase { int dimDepth = dimModel / dimHeads; - auto output - = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); + auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth] } @@ -364,9 +363,9 @@ class Transformer : public EncoderOrDecoderBase { Expr LayerAttention(std::string prefix, Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] - const Expr& keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] - const Expr& values, // ...? - const Expr& mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] + Expr keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + Expr values, // ...? + Expr mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] int dimHeads, bool cache = false, bool saveAttentionWeights = false) { @@ -376,6 +375,12 @@ class Transformer : public EncoderOrDecoderBase { auto opsPre = opt("transformer-preprocess"); auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb); + // fixes missing norm for keys and values in self-attention with pre-norm + if(input == keys) + keys = output; + if(input == values) + values = output; + // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); @@ -403,7 +408,7 @@ class Transformer : public EncoderOrDecoderBase { opt("transformer-heads"), /*cache=*/false); } - Expr LayerFFN(std::string prefix, Expr input) const { + Expr LayerFFN(std::string prefix, Expr input, bool isDecoder=false) const { int dimModel = input->shape()[-1]; float dropProb = inference_ ? 0 : opt("transformer-dropout"); @@ -411,13 +416,22 @@ class Transformer : public EncoderOrDecoderBase { auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb); auto actName = opt("transformer-ffn-activation"); + int dimFfn = opt("transformer-dim-ffn"); int depthFfn = opt("transformer-ffn-depth"); - float ffnDropProb - = inference_ ? 0 : opt("transformer-dropout-ffn"); - + if(isDecoder) { + int decDimFfn = opt("transformer-decoder-dim-ffn", 0); + if(decDimFfn != 0) + dimFfn = decDimFfn; + + int decDepthFfn = opt("transformer-decoder-ffn-depth", 0); + if(decDepthFfn != 0) + depthFfn = decDepthFfn; + } + ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn); - + + float ffnDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 
1.f / sqrtf((float)depth_) : 1.f); // the stack of FF layers @@ -866,7 +880,7 @@ class DecoderTransformer : public Transformer { // remember decoder state decoderStates.push_back(decoderState); - query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query, /*isDecoder=*/true); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] checkpoint(query); } diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1afb8f648..1e1adc38b 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -24,6 +24,10 @@ void IsNaN(const Tensor /*in*/, Ptr /*allocator*/, bool& /*isNaN*/, b ABORT("Not implemented"); } +bool SanitizeGradient(marian::Tensor /*in*/, Ptr /*allocator*/, bool /*pruneNaN*/, bool /*clipInf*/) { + ABORT("Not implemented"); +} + template void CopyCastTo(To* out, const From* in, int length) { for(int i = 0; i < length; ++i) @@ -735,6 +739,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -756,10 +761,16 @@ void Insert(Tensor out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axisCPU] = (int)indices->data()[idxIndex]; int outIndex = outShape.index(dims); - out->data()[outIndex] += in->data()[index]; + if(add) + out->data()[outIndex] += in->data()[index]; + else + out->data()[outIndex] = in->data()[index]; } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + void GRUFastForward(Tensor out_, std::vector inputs, bool final) { int rows = out_->shape().elements() / out_->shape().back(); int cols = out_->shape().back(); diff --git a/src/tensors/gpu/element.cu b/src/tensors/gpu/element.cu index 6790efd4b..e9cbe0812 100755 --- a/src/tensors/gpu/element.cu +++ b/src/tensors/gpu/element.cu @@ -29,7 +29,9 @@ __global__ void gElement( indices[i] = tensors[i].shape().bindex(dims); } - tensors[0].data()[index] = functional::apply(functor, tensors, indices); + // This performs the internal application of the functor in float32 regardless of the input type. + // It seems there are no speed penalties but improved precision. + tensors[0].data()[index] = (T)functional::applyWithCast(functor, tensors, indices); } } } @@ -65,13 +67,7 @@ void Element(Functor functor, Tensor out, Tensors... 
tensors) { ElementTyped(functor, out, tensors...); } else if(out->type() == Type::float16) { #if COMPILE_FP16 - std::vector ts({out, tensors...}); - bool div2 = std::all_of(ts.cbegin(), ts.cend(), [](marian::Tensor t){ return t->shape()[-1] % 2 == 0; }); - if(div2) { - ElementTyped(functor, out, tensors...); - } else { - ElementTyped(functor, out, tensors...); - } + ElementTyped(functor, out, tensors...); #else ABORT("FP16 not supported with chosen current hardware or CUDA version"); #endif diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index bf0d23957..c72af4db9 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -562,7 +562,11 @@ void ProdBatchedLegacy(marian::Tensor C, ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); #if COMPILE_FP16 } else if(C->type() == Type::float16) { // not a *.cu file - ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); + // we use computeType=float here for fp16 training as this seems more stable and roughly as fast + ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); + + // original for reference: + // ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); #endif } else { ABORT("ProdBatchedLegacy not implemented for element type {}", C->type()); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index d55214bc7..2103ca9de 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -16,15 +16,12 @@ namespace gpu { namespace atomics { static inline __device__ void atomicAdd(float *address, float val) { - //*address += val; ::atomicAdd(address, val); } #if COMPILE_FP16 // @TODO: copied from CuTorch, adapt this better, give credit. static inline __device__ void atomicAdd(half *address, half val) { - //*address += val; - #if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10 ::atomicAdd(address, val); #else // __CUDA_ARCH__ < 700 @@ -50,7 +47,8 @@ static inline __device__ void atomicAdd(half *address, half val) { } while (assumed != old); #endif // __CUDA_ARCH__ } -#endif +#endif // COMPILE_FP16 + } @@ -96,6 +94,81 @@ void IsNaN(const Tensor in, Ptr allocator, bool& isNaN, bool& isInf) cudaStreamSynchronize(0); } +template +__global__ void gSanitizeGradient(T* in, int length, + bool* isNaN, bool* isInf, + bool pruneNaN, bool clipInf, + float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) { + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + float v = (float)in[index]; + // handle NaN + if(isnan(v)) { + if(pruneNaN) { + in[index] = (T)forNaN; + } else { + *isNaN = true; + } + } + // handle +/- Inf + if(isinf(v)) { + if(clipInf) { + in[index] = v > 0 ? (T)forInf : (T)forInfNeg; + } else { + *isInf = true; + } + } + } + } +} + +// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. +// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. +// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. +// In that case infinities do not result in a bad gradient, since they get clipped. +// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result +// in a bad gradient. 
+// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), +// we return `false` indicating a bad gradient. +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { + cudaSetDevice(in->getDeviceId().no); + + int length = in->size(); + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + auto mem = allocator->alloc(2); + bool* dIsNaN = &mem->data()[0]; + bool* dIsInf = &mem->data()[1]; + fill(in->getBackend(), dIsNaN, dIsNaN + 2, false); + + float forNaN = 0.f; + float forInf = NumericLimits(in->type()).max; + float forInfNeg = NumericLimits(in->type()).lowest; + + if(in->type() == Type::float32) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#endif + } else { + ABORT("gSanitizeGradient for type {} not implemented", in->type()); + } + + bool isNaN, isInf; + CudaCopy(dIsNaN, dIsNaN + 1, &isNaN); + CudaCopy(dIsInf, dIsInf + 1, &isInf); + + allocator->free(mem); + + cudaStreamSynchronize(0); + + return !isNaN && !isInf; +} + template __global__ void gCopyCastTo(To* out, const From* in, int length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { @@ -1090,7 +1163,7 @@ void PasteRows(Tensor out, size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); -#if 1 // @TODO: make this configurable with a 'deterministic' flag +#if 0 // @TODO: make this configurable with a 'deterministic' flag // If we only use one block, then each core operates on a different column, // hence the summation becomes deterministic. // However, we only use e.g. 512 cores out of possibly 3000+, so this will be @@ -1236,7 +1309,7 @@ __global__ void gSelect(T* out, } } -template +template __global__ void gInsert(T* out, functional::Shape outShape, const T* in, @@ -1254,7 +1327,10 @@ __global__ void gInsert(T* out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int outIndex = outShape.index(dims); - out[outIndex] += in[index]; // this is probably wrong, atomicAdd? + if(add) + out[outIndex] += in[index]; // this is probably wrong, atomicAdd? 
+ else + out[outIndex] = in[index]; } } } @@ -1276,21 +1352,21 @@ void Select(Tensor out, if(out->type() == Type::float32) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else if(out->type() == Type::uint32) { gSelect<<>>(out->data(), @@ -1305,6 +1381,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -1320,28 +1397,31 @@ void Insert(Tensor out, int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else { ABORT("Insert not implemented for type {}", out->type()); } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + template __global__ void gGRUFastForward(T* out, const T* state, @@ -1355,7 +1435,7 @@ __global__ void gGRUFastForward(T* out, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOut = out + j * cols; const T* rowState = state + j * cols; @@ -1365,21 +1445,21 @@ __global__ void gGRUFastForward(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { - T r = functional::Ops::sigmoid(xWrow[i] + sUrow[i] + b[i]); + float r = functional::Ops::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]); int k = i + cols; - T z = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); + float z = functional::Ops::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]); int l = i + 2 * cols; - T h; + float h; if(final) - h = functional::Ops::tanh(xWrow[l] + (sUrow[l] + b[l]) * r); + h = functional::Ops::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(xWrow[l] + sUrow[l] * r + b[l]); + h = functional::Ops::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]); - T out = ((T)1.f - z) * h + z * rowState[i]; - rowOut[i] = m * out + ((T)1.f - m) * rowState[i]; + float out = (1.f - z) * h + z * (float)rowState[i]; + rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]); } } } @@ -1441,7 +1521,7 @@ __global__ void gGRUFastBackward(T* outState, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOutState = outState + j * cols; T* rowOutXW = outXW + j * cols * 3; @@ -1459,56 +1539,56 @@ __global__ void gGRUFastBackward(T* outState, int k = i + cols; int l = i + 2 * cols; - T r = 
functional::Ops::sigmoid(rowXW[i] + rowSU[i] + b[i]); - T z = functional::Ops::sigmoid(rowXW[k] + rowSU[k] + b[k]); + float r = functional::Ops::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]); + float z = functional::Ops::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]); - T h; + float h; if(final) - h = functional::Ops::tanh(rowXW[l] + (rowSU[l] + b[l]) * r); + h = functional::Ops::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(rowXW[l] + rowSU[l] * r + b[l]); + h = functional::Ops::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]); - T adj = rowAdj[i]; + float adj = rowAdj[i]; - T t = ((T)1.f - z) * ((T)1.f - h * h); + float t = (1.f - z) * (1.f - h * h); // df/ds if(outState) - rowOutState[i] += (m * z - m + (T)1.f) * adj; + rowOutState[i] += (T)((m * z - m + 1.f) * adj); // df/d(xW_r) ... - T dfdxW_r = m * r * ((T)1.f - r) * t * adj; + float dfdxW_r = m * r * (1.f - r) * t * adj; if(final) - dfdxW_r *= rowSU[l] + b[l]; + dfdxW_r *= (float)rowSU[l] + (float)b[l]; else - dfdxW_r *= rowSU[l]; + dfdxW_r *= (float)rowSU[l]; if(outXW) - rowOutXW[i] += dfdxW_r; + rowOutXW[i] += (T)dfdxW_r; if(outSU) - rowOutSU[i] += dfdxW_r; + rowOutSU[i] += (T)dfdxW_r; if(outB) - rowOutB[i] += dfdxW_r; + rowOutB[i] += (T)dfdxW_r; // df/d(xW_z) ... - T dfdxW_z = m * ((T)1.f - z) * z * (rowState[i] - h) * adj; + float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj; if(outXW) - rowOutXW[k] += dfdxW_z; + rowOutXW[k] += (T)dfdxW_z; if(outSU) - rowOutSU[k] += dfdxW_z; + rowOutSU[k] += (T)dfdxW_z; if(outB) - rowOutB[k] += dfdxW_z; + rowOutB[k] += (T)dfdxW_z; // df/d(xW_x) ... - T dfdxW_x = m * t * adj; + float dfdxW_x = m * t * adj; if(outXW) - rowOutXW[l] += dfdxW_x; + rowOutXW[l] += (T)dfdxW_x; if(outSU) - rowOutSU[l] += dfdxW_x * r; + rowOutSU[l] += (T)(dfdxW_x * r); if(outB) if(final) - rowOutB[l] += dfdxW_x * r; + rowOutB[l] += (T)(dfdxW_x * r); else - rowOutB[l] += dfdxW_x; + rowOutB[l] += (T)dfdxW_x; } } } diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 6e587953c..1fc4542d8 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -41,6 +41,25 @@ DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor); DISPATCH2(AddCast, marian::Tensor, const marian::Tensor); DISPATCH4(IsNaN, const Tensor, Ptr, bool&, bool&); +#ifdef CUDA_FOUND +namespace gpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} +#endif + +namespace cpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} + +static inline bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { +#ifdef CUDA_FOUND + if(in->getBackend()->getDeviceId().type == DeviceType::gpu) + return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); + else +#endif + return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); +} + template void Element(Functor functor, marian::Tensor out, Tensors... 
tensors) { #ifdef CUDA_FOUND @@ -278,7 +297,28 @@ DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int) -DISPATCH4(Insert, marian::Tensor, const marian::Tensor, const marian::Tensor, int) + +#ifdef CUDA_FOUND +namespace gpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} +#endif + +namespace cpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} + +template +static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Insert(out, in, indices, axis); + else +#endif + cpu::Insert(out, in, indices, axis); +} DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index e9c977b9c..59cd4b6d8 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -10,25 +10,19 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) mbRoundUp_(options_->get("mini-batch-round-up", true)) { if(options_->hasAndNotEmpty("cost-scaling")) { auto vcs = options_->get>("cost-scaling"); - costScale_ = true; - float costExponent = std::stof(vcs[0]); - costScaleFactor_ = std::pow(2.0f, costExponent); - - if(vcs.size() > 1) costScaleFreq_ = std::stoul(vcs[1]); - if(vcs.size() > 2) costScaleMultiplier_ = std::stof(vcs[2]); - if(vcs.size() > 3) costScaleNanTolerance_ = std::stof(vcs[3]); - if(vcs.size() > 4) costScaleNanRange_ = std::stoul(vcs[4]); - if(vcs.size() > 5) costScaleFactorMinimum_ = std::stof(vcs[5]); + + costScaling_ = true; + costScalingFactor_ = std::stof( vcs[0]); + if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]); + if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]); + if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]); LOG_ONCE(info, - "Training with cost scaling - factor: 2^{} = {}, frequency: {}, multiplier: {}, tolerance: {}, range: {}, minimum: {}", - costExponent, - costScaleFactor_, - costScaleFreq_, - costScaleMultiplier_, - costScaleNanTolerance_, - costScaleNanRange_, - costScaleFactorMinimum_); + "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}", + costScalingFactor_, + costScalingFreq_, + costScalingMultiplier_, + costScalingFactorMinimum_); } if(options_->hasAndNotEmpty("dynamic-gradient-scaling")) { @@ -37,11 +31,16 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) if(vgc.size() > 0) dynamicGradientScalingFactor_ = std::stof(vgc[0]); if(vgc.size() > 1) dynamicGradientScalingUseLogs_ = vgc[1] == "log"; + if(vgc.size() > 2) dynamicGradientScalingFadeout_ = std::stoul(vgc[2]); LOG_ONCE(info, "Re-scaling gradient to have average gradient norm if (log={}) gradient norm diverges from average by {} sigmas", dynamicGradientScalingUseLogs_, dynamicGradientScalingFactor_); + if(dynamicGradientScalingFadeout_ > 0) + LOG_ONCE(info, + "Dynamic gradient re-scaling will fade out linearly after {} updates", + dynamicGradientScalingFadeout_); } if(options_->get("check-gradient-nan")) { @@ -96,21 +95,17 @@ void GraphGroup::initGraphsAndOpts() { // given number of iterations. Usually we increase by 2 which adds // one more bit for precision. 
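As a reference for how the cost-scaling counters above interact, here is a minimal standalone sketch of the schedule implemented by increaseCostScaleFactor()/decreaseCostScaleFactor(): the factor grows by the multiplier after costScalingFreq_ consecutive NaN-free updates and shrinks on overflow, but never below the configured minimum. The CostScaler class and its members are illustrative only, not part of GraphGroup.

```cpp
// Minimal sketch of the cost-scaling schedule; names are illustrative.
#include <cstddef>
#include <iostream>

class CostScaler {
  float factor_;           // current scaling factor applied to the loss
  const float multiplier_;
  const float minimum_;
  const std::size_t freq_; // number of clean updates before the factor is raised
  std::size_t cleanSeen_ = 0;

public:
  CostScaler(float factor, float multiplier, float minimum, std::size_t freq)
      : factor_(factor), multiplier_(multiplier), minimum_(minimum), freq_(freq) {}

  float factor() const { return factor_; }

  // call after an update whose gradient contained no NaN/Inf
  void onCleanUpdate() {
    if(++cleanSeen_ % freq_ == 0)
      factor_ *= multiplier_; // e.g. doubling adds one bit of fp16 precision
  }

  // call when a NaN/Inf was seen; that update itself would be skipped
  void onOverflow() {
    if(factor_ > minimum_)
      factor_ /= multiplier_; // back off, but never below the minimum
    cleanSeen_ = 0;
  }
};

int main() {
  CostScaler scaler(/*factor=*/256.f, /*multiplier=*/2.f, /*minimum=*/1.f, /*freq=*/2000);
  for(int i = 0; i < 2000; ++i)
    scaler.onCleanUpdate();
  std::cout << scaler.factor() << "\n"; // 512 after 2000 clean updates
  scaler.onOverflow();
  std::cout << scaler.factor() << "\n"; // back to 256
  return 0;
}
```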
void GraphGroup::increaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; noNanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = noNanSeen_ == (float)nanSeen_ / (float)total; // total is at least 1 because of noNanSeen_++ - if(noNanSeen_ % costScaleFreq_ == 0) { - costScaleFactor_ *= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} after {} gradient updates. Increasing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); + if(noNanSeen_ % costScalingFreq_ == 0) { + costScalingFactor_ *= costScalingMultiplier_; + if(isMainProcess()) + LOG(debug, "No NaN/Inf after {} gradient updates. Increasing cost-scaling factor to {}", total, costScalingFactor_); // Resetting counts after cost-scale change noNanSeen_ = 0; @@ -120,48 +115,56 @@ void GraphGroup::increaseCostScaleFactor() { // call when a NaN was seen to decrease cost-scaling factor void GraphGroup::decreaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; nanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = (float)nanSeen_ / (float)total; // total is at least 1 because of nanSeen_++ - if(total >= costScaleNanRange_ && nanPercent > costScaleNanTolerance_) { - if(costScaleFactor_ > costScaleFactorMinimum_) { - costScaleFactor_ /= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} in {} gradient updates, reducing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); - } else { - // @TODO: think if should this rather abort? - LOG(warn, - "NaN/Inf percentage {:.2f} in {} gradient updates, but cost-scaling factor {} is already at minimum", - nanPercent, - total, - costScaleFactor_); - } - // Resetting counts after cost-scale change - noNanSeen_ = 0; - nanSeen_ = 0; + // do not reduce cost-scaling factor below minimum + if(costScalingFactor_ > costScalingFactorMinimum_) + costScalingFactor_ /= costScalingMultiplier_; + + if(isMainProcess()) { + if(costScalingFactor_ > costScalingFactorMinimum_) + LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to {}", total, costScalingFactor_); + else + LOG(debug, "Seen NaN/Inf after {} gradient updates, Reduced cost-scaling factor to minimum {}. Pruning NaNs now.", total, costScalingFactor_); } + + // Resetting counts after cost-scale change + noNanSeen_ = 0; + nanSeen_ = 0; } float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) { auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin); - if(checkGradientNan_ || costScale_) { - bool hasNan = false, hasInf = false; - IsNaN(curGrad, graphs_[i]->allocator(), hasNan, hasInf); // @TODO: make safe with different compiler options - if(hasNan || hasInf) { - LOG(debug, "Found Nan ({}) or Inf ({})", hasNan, hasInf); + // If costScaling_ then check for NaN values if the costScalingFactor_ is larger than + // the minimum. If a NaN value is seen we exit here and will reduce the factor next and + // this skips an update. + // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces + // NaNs with 0. Updates are not skipped any more. + // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type. + // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent + // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large. 
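The comment above summarizes what SanitizeGradient does to a gradient tensor. A CPU-side sketch over a plain float buffer (not a marian::Tensor, and without the CUDA kernel machinery) looks roughly like this, with 65504 standing in for the largest finite fp16 value mentioned above:

```cpp
// Sketch of gradient sanitization on a plain float buffer. Returns true if the
// buffer is free of NaN/Inf after optional pruning/clipping, i.e. the gradient
// is "sane" and the update can proceed.
#include <cmath>
#include <iostream>
#include <limits>
#include <vector>

bool sanitizeGradient(std::vector<float>& grad, bool pruneNaN, bool clipInf,
                      float forInf = 65504.f /* largest finite fp16 value */) {
  bool sawNaN = false, sawInf = false;
  for(float& g : grad) {
    if(std::isnan(g)) {
      if(pruneNaN) g = 0.f;       // NaNs are pruned, the update can proceed
      else         sawNaN = true; // NaN kept -> caller should skip the update
    } else if(std::isinf(g)) {
      if(clipInf) g = (g > 0.f) ? forInf : -forInf; // clip +/-Inf to the limit
      else        sawInf = true;
    }
  }
  return !sawNaN && !sawInf;
}

int main() {
  std::vector<float> g = {0.5f, std::numeric_limits<float>::infinity(), NAN};
  std::cout << sanitizeGradient(g, /*pruneNaN=*/true, /*clipInf=*/true) << "\n"; // 1
  std::cout << g[1] << " " << g[2] << "\n"; // 65504 0
  return 0;
}
```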
+ if(costScaling_ || checkGradientNan_) { + bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_; + bool clipInf = !checkGradientNan_; + bool saneGradient = SanitizeGradient(curGrad, graphs_[i]->allocator(), pruneNaN, clipInf); + + // This should never happen, if it does, something is wrong with the kernel above and needs to be fixed. + ABORT_IF(pruneNaN && clipInf && !saneGradient, "We are removing NaNs and clipping Infs, but gradient is still not sane??"); + + if(!saneGradient) { + LOG(debug, "Found NaN"); return std::numeric_limits::quiet_NaN(); } } - + + // The optional clipping above will affect the norm here. The norm can be non-finite despite the above + // gradient sanitization, hence check again and propagate a NaN. if(dynamicGradientScaling_) { auto gNorm = L2Norm(curGrad, graphs_[i]->allocator()); if(isFinite(gNorm) && gNorm > 0.0) @@ -197,8 +200,8 @@ float GraphGroup::executeAndCollectNorm(const std::functionget("normalize-gradient")) normalizationFactor *= updateTrgWords; @@ -207,9 +210,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) return normalizationFactor; if(dynamicGradientScaling_) { - // make gradient norm invariant to changes in costScaleFactor_, luckily norm(c * g) = c * norm(g) - if(costScale_) - gNorm = gNorm / costScaleFactor_; + // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g) + if(costScaling_) + gNorm = gNorm / costScalingFactor_; // Normalize gradient norm w.r.t. number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this @@ -231,11 +234,17 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average. auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms. + float fadeoutMultiplier = 1.f; + if(dynamicGradientScalingFadeout_ > 0ul) // fade out linearly after that many updates @TODO: allow units other than updates + fadeoutMultiplier = (float)std::max(dynamicGradientScalingFadeout_, scheduler_->numberOfBatches()) / (float)dynamicGradientScalingFadeout_; + + float dynamicGradientScalingFactorWithFadeout = dynamicGradientScalingFactor_ * fadeoutMultiplier; // if fadeoutMultiplier increases dynamic gradient scaling becomes less and less likely to happen over time. // delta of (log) gradient norm vs (log) gradient norm average is larger than N standard deviations // hence rescale gradient using the average. 
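A rough standalone sketch of that decision rule: track running statistics of the (log) gradient norms and shrink any gradient whose norm deviates from the average by more than factor * sigma, optionally weakened by a fade-out multiplier. The exponential moving average below is a simplification of the windowed statistics kept in GraphGroup, so treat it as an illustration of the rule rather than the exact update.

```cpp
// Simplified illustration of dynamic gradient scaling; not the GraphGroup code.
#include <cmath>
#include <iostream>

struct GradNormTracker {
  double avg = 0.0, var = 0.0; // running mean/variance of log gradient norms
  long   n   = 0;
  double alpha = 0.01;         // EMA smoothing constant (assumption)

  // returns a multiplier to apply to the gradient (1.0 = leave unchanged)
  double update(double gNorm, double factor, double fadeoutMultiplier = 1.0) {
    double x = std::log(gNorm);
    ++n;
    double delta = x - avg;
    avg += alpha * delta;
    var  = (1.0 - alpha) * (var + alpha * delta * delta);

    double threshold = factor * fadeoutMultiplier * std::sqrt(var);
    if(n > 100 && delta > threshold) // norm spiked well above the running average
      return std::exp(avg) / gNorm;  // rescale the gradient down to the average norm
    return 1.0;
  }
};

int main() {
  GradNormTracker tracker;
  for(int i = 0; i < 1000; ++i)
    tracker.update(/*gNorm=*/1.0, /*factor=*/2.0);
  // a sudden 100x norm spike is scaled back towards the running average
  std::cout << tracker.update(/*gNorm=*/100.0, /*factor=*/2.0) << "\n"; // roughly 0.01
  return 0;
}
```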
- if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactor_ * gNormStdTransform) { - LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f}", - dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactor_, gNormStdTransform); + if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactorWithFadeout * gNormStdTransform) { + if(isMainProcess()) + LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}", + dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm); normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. } @@ -288,9 +297,7 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { restoreFromCheckpoint(modelFileName, scatterFn); } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string nameInit = options_->get("pretrained-model"); - LOG(info, - "[training] Initializing model weights with pre-trained model {}", - nameInit); + LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit); size_t i = 0; for(auto graph : graphs_) diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 0e4a68dcc..9f1362e75 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -60,21 +60,21 @@ class GraphGroup { double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false - bool costScale_{false}; - float costScaleFactor_{1.f}; // @TODO, add current costScaleFactor_ to trainingState for serialization - size_t costScaleFreq_{2000}; - float costScaleMultiplier_{2.f}; - float costScaleNanTolerance_{0.f}; - size_t costScaleNanRange_{1}; - float costScaleFactorMinimum_{1.f}; // @TODO make this configureable + bool costScaling_{false}; + float costScalingFactor_{1.f}; // @TODO, add current costScalingFactor_ to trainingState for serialization + size_t costScalingFreq_{2000}; + float costScalingMultiplier_{2.f}; + float costScalingFactorMinimum_{1.f}; + size_t noNanSeen_{0}; // @TODO, add current noNanSeen_ to trainingState for serialization size_t nanSeen_{0}; + bool checkGradientNan_{false}; + bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; - - bool checkGradientNan_{false}; + size_t dynamicGradientScalingFadeout_{0ul}; // determines the number of input streams (i.e. input files or fields in the TSV input) that need // to be included in the batch, i.e. 
without alignments and weights diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp index 72b06e489..f85f9cf85 100644 --- a/src/training/graph_group_async.cpp +++ b/src/training/graph_group_async.cpp @@ -143,13 +143,13 @@ void AsyncGraphGroup::execute(Ptr batch) { thread_local Tensor accGradients; thread_local Ptr accAlloc; - ABORT_IF(costScale_ ,"Cost-scaling not implemented for AsyncSGD"); + ABORT_IF(costScaling_ ,"Cost-scaling not implemented for AsyncSGD"); auto graph = graphs_[tid]; Ptr dynamicLoss = models_[tid]->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // it's ok to go out of scope, this will still insert the new top node into the graph - auto costNode = dynamicLoss->loss() * costScaleFactor_; + auto costNode = dynamicLoss->loss() * costScalingFactor_; } if(t % optimizerDelay_ == 0) { diff --git a/src/training/graph_group_singleton.cpp b/src/training/graph_group_singleton.cpp index 7dc861375..162610705 100644 --- a/src/training/graph_group_singleton.cpp +++ b/src/training/graph_group_singleton.cpp @@ -16,16 +16,16 @@ void SingletonGraph::execute(Ptr batch) { auto opt = optimizerShards_[0]; auto lossNode = model->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // for fp16 training, it's ok to go out of scope, we do not use the scaled version for anything - auto scaledLoss = lossNode->loss() * costScaleFactor_; + auto scaledLoss = lossNode->loss() * costScalingFactor_; } graph->forward(); graph->backward(); bool noNanOrInf = true; - if(costScale_) { + if(costScaling_) { // Are there NaNs in the gradient? bool hasNan = false, hasInf = false; IsNaN(graph->params()->grads(), graph->allocator(), hasNan, hasInf); @@ -39,7 +39,7 @@ void SingletonGraph::execute(Ptr batch) { opt->update(graph->params()->vals(), graph->params()->grads(), batch->wordsTrg(), - costScaleFactor_); + costScalingFactor_); if(scheduler_) { scheduler_->update(*lossNode, batch); diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp index 8c06761e1..c90a384e4 100644 --- a/src/training/graph_group_sync.cpp +++ b/src/training/graph_group_sync.cpp @@ -252,8 +252,8 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num { // let loss go out of scope, frees memory auto rationalLoss = models_[localDeviceIndex]->build(graph, subBatch); - if(costScaleFactor_ != 1.f) - rationalLoss->loss() * costScaleFactor_; + if(costScalingFactor_ != 1.f) + rationalLoss->loss() * costScalingFactor_; graph->forward(); localDeviceLosses[localDeviceIndex] += *rationalLoss; @@ -262,7 +262,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num graph->backward(/*zero=*/false); // (gradients are reset before we get here) } -#if 1 +#if 0 // @TODO: this can probably be removed now, keep around until confirmed. // experimental and should eventually be somewhere else // Handle local gradient explosion but only clip to largest possible value // given number of GPUs and type. Should clip rarely. 
Also clips inf @@ -284,7 +284,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards float gradNorm = 0.f; - if(costScale_ || dynamicGradientScaling_ || checkGradientNan_) { + if(costScaling_ || dynamicGradientScaling_ || checkGradientNan_) { // Wrapping member function auto checkNanOrNorm = [&](size_t i, size_t begin, size_t end) { return GraphGroup::checkNanOrNorm(i, begin, end); diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 2a0d3947a..580895f2f 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -94,7 +94,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); std::vector factorIndices; factoredVocab->word2factors(word, factorIndices); //LOG(info, "{} + {} ({}) -> {} -> {}", // factoredVocab->decode(prevHyp->tracebackWords()), @@ -115,7 +115,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } } else if (shortlist) - word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) origBatchIdx, wordIdx)); + word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx)); else word = Word::fromWordIndex(wordIdx); @@ -330,6 +330,7 @@ Histories BeamSearch::search(Ptr graph, Ptr auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step // main loop over output time steps for (size_t t = 0; ; t++) { + //std::cerr << "\nstep=" << t << std::endl; ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??"); // determine beam size for next output time step, as max over still-active sentences // E.g. 
if all batch entries are down from beam 5 to no more than 4 surviving hyps, then diff --git a/src/translator/nth_element.cpp b/src/translator/nth_element.cpp index 237d9b9da..dbcceec47 100644 --- a/src/translator/nth_element.cpp +++ b/src/translator/nth_element.cpp @@ -3,7 +3,9 @@ * SPDX-License-Identifier: MIT */ +#include "common/utils.h" #include "translator/nth_element.h" + #include #include #include diff --git a/src/translator/translator.h b/src/translator/translator.h index 4084ced95..0621fc8ce 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -122,7 +122,7 @@ class Translate : public ModelTask { threadPool.enqueue(task, device, id++); } - if(options_->get("output-sampling", false)) { + if(options_->hasAndNotEmpty("output-sampling")) { if(options_->get("beam-size") > 1) LOG(warn, "[warning] Output sampling and beam search (beam-size > 1) are contradictory methods " From aafe8fb5ca8f613f52da7589a72e1a647d51f820 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 7 Feb 2022 02:36:20 -0800 Subject: [PATCH 142/254] update regression tests pointer --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 32a2f7960..0716f4e01 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 32a2f7960d8cc48d6c90cbb5d03fbb42eb923d3d +Subproject commit 0716f4e012d1e3f7543bffa8aecc97ce9c903e17 From a365bb5ce99135eab29ffe378e0c6c9fb9bf0c1b Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 7 Feb 2022 08:09:54 -0800 Subject: [PATCH 143/254] fix server behaviour --- src/data/text_input.cpp | 4 ++-- src/data/text_input.h | 6 ++---- src/translator/translator.h | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index b1f4cdd47..196cf421a 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -13,7 +13,7 @@ void TextIterator::increment() { } bool TextIterator::equal(TextIterator const& other) const { - return this->pos_ == other.pos_ || (this->tup_.empty() && other.tup_.empty()); + return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); } const SentenceTuple& TextIterator::dereference() const { @@ -59,7 +59,7 @@ SentenceTuple TextInput::next() { if(tup.size() == files_.size()) // check if each input file provided an example return SentenceTuple(tup); else if(tup.size() == 0) // if no file provided examples we are done - return SentenceTuple(); + return SentenceTupleImpl(); // return an empty tuple if above test does not pass(); else // neither all nor none => we have at least on missing entry ABORT("There are missing entries in the text tuples."); } diff --git a/src/data/text_input.h b/src/data/text_input.h index b08a4fdcc..98d991bcb 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -37,12 +37,10 @@ class TextInput : public DatasetBase { bool maxLengthCrop_{false}; public: - typedef SentenceTuple Sample; - TextInput(std::vector inputs, std::vector> vocabs, Ptr options); virtual ~TextInput() {} - Sample next() override; + SentenceTuple next() override; void shuffle() override {} void reset() override {} @@ -52,7 +50,7 @@ class TextInput : public DatasetBase { // TODO: There are half dozen functions called toBatch(), which are very // similar. Factor them. 
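The SentenceTuple changes used by TextInput::next() above follow a simple "value or future" pattern: a tuple is either constructed eagerly from a finished SentenceTupleImpl or from a std::future that a data thread will fulfil, and the payload is materialized lazily on first access. A stripped-down generic version of that pattern, with illustrative names rather than the Marian types, might look like this:

```cpp
// Generic illustration of the future-backed wrapper pattern; names are not Marian types.
#include <future>
#include <iostream>
#include <memory>
#include <string>

template <class T>
class Lazy {
  std::shared_ptr<std::future<T>> future_; // pending asynchronous result
  mutable std::shared_ptr<T> value_;       // materialized result, cached after get()

public:
  Lazy() {}                                // empty sentinel, valid() == false
  explicit Lazy(const T& v) : value_(std::make_shared<T>(v)) {}
  explicit Lazy(std::future<T>&& f)
      : future_(std::make_shared<std::future<T>>(std::move(f))) {}

  bool valid() const { return future_ || value_; }

  T& get() const {
    if(!value_)                            // first access: wait for the async result
      value_ = std::make_shared<T>(future_->get());
    return *value_;
  }
};

int main() {
  // eager construction, analogous to SentenceTuple(const SentenceTupleImpl&)
  Lazy<std::string> eager(std::string("processed line"));
  // lazy construction, analogous to SentenceTuple(std::future<SentenceTupleImpl>&&)
  Lazy<std::string> lazy(std::async(std::launch::async, [] {
    return std::string("line processed on a data thread");
  }));

  std::cout << eager.get() << "\n";
  std::cout << lazy.get() << "\n";                  // blocks only if still running
  std::cout << Lazy<std::string>().valid() << "\n"; // 0: the empty-tuple sentinel
  return 0;
}
```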
- batch_ptr toBatch(const std::vector& batchVector) override { + batch_ptr toBatch(const std::vector& batchVector) override { size_t batchSize = batchVector.size(); std::vector sentenceIds; diff --git a/src/translator/translator.h b/src/translator/translator.h index 0621fc8ce..75b5070b3 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -330,7 +330,7 @@ class TranslateService : public ModelServiceTask { ? convertTsvToLists(input, options_->get("tsv-fields", 1)) : std::vector({input}); auto corpus_ = New(inputs, srcVocabs_, options_); - data::BatchGenerator batchGenerator(corpus_, options_); + data::BatchGenerator batchGenerator(corpus_, options_, nullptr, /*runAsync=*/false); auto collector = New(options_->get("quiet-translation", false)); auto printer = New(options_, trgVocab_); From 05ba9e4c319db2317319227f5706f634340e0db4 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 8 Feb 2022 02:57:20 -0800 Subject: [PATCH 144/254] add -DDETERMINISTIC=ON/OFF flag (#912) * Add -DDETERMINISTIC=ON/OFF flag to CMake * Use -DDETERMINISTIC=on in GitHub/Azure workflows Co-authored-by: Roman Grundkiewicz --- .github/workflows/ubuntu.yml | 1 + .github/workflows/windows.yml | 2 ++ CMakeLists.txt | 12 +++++++++++- azure-pipelines.yml | 4 +++- src/common/config_parser.cpp | 10 ++++++++++ src/tensors/gpu/tensor_operators.cu | 2 +- 6 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index a7f233ca6..4a0fa6746 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -98,6 +98,7 @@ jobs: -DCOMPILE_SERVER=on \ -DCOMPILE_TESTS=${{ matrix.unit_tests }} \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \ + -DDETERMINISTIC=on \ -DUSE_FBGEMM=${{ matrix.cpu }} \ -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=on \ diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index dd10c733d..ee85f303d 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -81,6 +81,7 @@ jobs: -DCOMPILE_CUDA="${{ matrix.gpu }}" -DCOMPILE_SERVER="FALSE" -DCOMPILE_TESTS="TRUE" + -DDETERMINISTIC="TRUE" -DUSE_FBGEMM="TRUE" -DUSE_MPI="FALSE" -DUSE_NCCL="FALSE" @@ -110,6 +111,7 @@ jobs: -DCOMPILE_CUDA="${{ matrix.gpu }}" -DCOMPILE_SERVER="FALSE" -DCOMPILE_TESTS="TRUE" + -DDETERMINISTIC="TRUE" -DUSE_FBGEMM="TRUE" -DUSE_MPI="FALSE" -DUSE_NCCL="FALSE" diff --git a/CMakeLists.txt b/CMakeLists.txt index eb6ca97b2..7c41b365c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(USE_NCCL "Use NCCL library" ON) option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF) +option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF) # fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them, # so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. 
Prior to CMake 3.12, @@ -571,6 +572,15 @@ if(USE_STATIC_LIBS) set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) endif() +if(DETERMINISTIC) + message(WARNING "Option DETERMINISTIC=ON: Trying to make training as deterministic as possible, may result in slow-down") + add_definitions(-DDETERMINISTIC=1) + list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=1; ) +else() + add_definitions(-DDETERMINISTIC=0) + list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=0; ) +endif() + # Find MPI if(USE_MPI) # 2.0 refers to MPI2 standard. OpenMPI is an implementation of that standard regardless of the specific OpenMPI version @@ -580,7 +590,7 @@ if(USE_MPI) include_directories(${MPI_INCLUDE_PATH}) set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES}) if(USE_STATIC_LIBS) # alternatively this could install OpenMPI like NCCL and link against that statically with greater control - message(WARNING "MPI implementations are notoriously difficult to link statically, linking ${MPI_LIBRARIES} dynamically despite -DUSE_STATIC_LIBS=on") + message(WARNING "MPI implementations are notoriously difficult to link statically, linking ${MPI_LIBRARIES} dynamically despite -DUSE_STATIC_LIBS=on") endif(USE_STATIC_LIBS) add_definitions(-DMPI_FOUND=1) endif(MPI_FOUND) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bc76f85c9..0348ebb42 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -470,7 +470,7 @@ stages: # Marian is built in the same job where the regression tests are run to make sure that executables -# is compiled and run on a machine with the same CPU architecture, which is required for +# are compiled and run on a machine with the same CPU architecture, which is required for # compilations with FBGEMM. - stage: Tests jobs: @@ -530,6 +530,7 @@ stages: -DCMAKE_MAKE_PROGRAM="ninja.exe" ^ -DCMAKE_TOOLCHAIN_FILE="$(VCPKG_DIR)\scripts\buildsystems\vcpkg.cmake" ^ -DVCPKG_TARGET_TRIPLET="x64-windows-static" ^ + -DDETERMINISTIC="TRUE" ^ ^ -DCOMPILE_CPU="TRUE" ^ -DCOMPILE_CUDA="FALSE" ^ @@ -634,6 +635,7 @@ stages: -DCMAKE_BUILD_TYPE=slim \ -DCOMPILE_CPU=on \ -DCOMPILE_CUDA=off \ + -DDETERMINISTIC=on \ -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=on diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index ebbe4a89a..837bee53f 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -897,8 +897,13 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { cli.add("--shuffle-in-ram", "Keep shuffled corpus in RAM, do not write to temp file"); +#if DETERMINISTIC cli.add("--data-threads", "Number of concurrent threads to use during data reading and processing", 1); +#else + cli.add("--data-threads", + "Number of concurrent threads to use during data reading and processing", 8); +#endif // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope. cli.add("--all-caps-every", @@ -919,8 +924,13 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. 
Disable with --mini-batch-round-up=false", true); } else { +#if DETERMINISTIC cli.add("--data-threads", "Number of concurrent threads to use during data reading and processing", 1); +#else + cli.add("--data-threads", + "Number of concurrent threads to use during data reading and processing", 8); +#endif } // clang-format on } diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 2103ca9de..9011f284a 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1163,7 +1163,7 @@ void PasteRows(Tensor out, size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); -#if 0 // @TODO: make this configurable with a 'deterministic' flag +#if DETERMINISTIC // If we only use one block, then each core operates on a different column, // hence the summation becomes deterministic. // However, we only use e.g. 512 cores out of possibly 3000+, so this will be From 8e659bb5c06951695259d5b4ec743cbee7a32134 Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Tue, 8 Feb 2022 10:58:09 +0000 Subject: [PATCH 145/254] Document Structure (#910) * Add architectural outline * Update index --- doc/index.rst | 1 + doc/structure.md | 143 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 doc/structure.md diff --git a/doc/index.rst b/doc/index.rst index a790e6247..d19bb4b00 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -12,6 +12,7 @@ This is developer documentation. User documentation is available at https://mari :maxdepth: 2 :caption: Contents: + structure graph operators layer diff --git a/doc/structure.md b/doc/structure.md new file mode 100644 index 000000000..b2bd16084 --- /dev/null +++ b/doc/structure.md @@ -0,0 +1,143 @@ +# Code Organisation + +This purpose of this document is to outline the organisational structure of the Marian codebase. Each section of this document approaches an architectural component and highlights a subset of directories that are relevant to it. + + +## Operating Modes +``` +marian/src +├── command +├── rescorer +├── training +└── translator +``` +The Marian toolkit provides several commands, covering different modes of operation. These are: + - `marian` + - `marian-decoder` + - `marian-server` + - `marian-scorer` + - `marian-vocab` + - `marian-conv` + +Each of which has a corresponding file in the `command` directory. + +The main `marian` command is capable of running all other modes (except server), see `marian-main.cpp` for the implementation. By default, it operates in `train` mode and corresponds to `marian-train.cpp`. Other modes may be accessed by calling `marian ` instead of `marian-`. + +Training is covered by the main `marian` command, with relevant implementation details kept inside the `training` subdirectory. Translation is facilitated by code in the `translator` subdirectory and is handled by the `marian-decoder` command, as well as `marian-server` which provides a web-socket service. `marian-scorer` is the tool used to re-score parallel inputs or n-best lists, and uses code in the `rescorer` subdirectory. + +The remaining commands `marian-vocab` and `marian-conv` provide useful auxiliary functions. `marian-vocab` is a tool to create a vocabulary file from a given text corpus. This uses components described in the Data section of this document. +`marian-conv` exists to convert Marian model files from `.npz`, `.bin` as well as lexical shortlists to binary shortlists. 
It is also possible to use this command to emit an ONNX-compliant model representation. In addition to components defined in the Data section, this also makes use of Model specific components. + +Finally, the implementation of the command-line-interface for these commands is described in the Utility section. + + +## Data +``` +marian/src +└── data +``` +Data refers to the handling and representation of the text input to Marian. +This consists of source code for the representation of the corpus, vocabulary and batches. + +Internally, tokens are represented as indices, or `Words`; some indices are reserved for special tokens, such as `EOS`, `UNK`. Vocabulary implementations are responsible for encoding and decoding sentences to and from the internal representation, whether that be a SentencePiece, Factors or Plain Text/YAML defined vocabulary file. + +This directory is also responsible for generating batches from a corpus and performing any shuffling of the corpus or batches, as requested. Furthermore, when using a shortlist, their behaviour is also defined here. + +Once the batches are generated they are passed as input to the expression graph. + + +## Expression Graph +``` +marian/src +├── functional +├── graph +├── optimizers +└── tensors +``` + +Marian implements a reverse-mode auto-differentiation computation graph. The relevant components reside in these subdirectories. The `graph` subdirectory concerns the structure of the graph, its nodes: operators, parameters and constants, as well as how to traverse it, both forwards and backwards. Moreover, it defines the APIs for operations that the graph is able to perform. + +The `tensors` and `functional` subdirectories contain the implementation of operations for the graph. + +One component of the `functional` subdirectory describes how functions operate on the underlying data types. This is a combination of standard operations on fundamental types, and SIMD intrinsics on extended types where available. The `functional` namespace also provides useful abstractions that enable generic formulas to be written. It defines variable-like objects `_1,_2`, such that `_1 * cos(_2)` represents the product of the argument at index 1 with the cosine of the argument at index 2. + +The `tensors` subdirectory contains the definition of a tensor object. In Marian, a tensor is a piece of memory which is ascribed a shape and type which is associated with a backend (the compute device). +This directory also contains the implementations of tensor operations on CPU and GPU, as well as universal functions that dispatches the call to the relevant device. + +More specific documentation is available that describes the [graph][graph], and how its [operators][graph_ops] are implemented. + + +## Model +``` +marian/src +├── models +├── layers +└── rnn +``` +The subdirectories above constitute the components of a Model. There are two main types of model: + - `IModel`, which maps inputs to predictions + - `ICriterionFunction`, which maps (inputs, references) to losses + +The usage of these interfaces sometimes combined. As an example, `Trainer`, an implementation of the `ICriterionFunction` interface used in training contains an `IModel` member from which it then computes the loss. + +An important specialisation of `IModel` is `IEncoderDecoder`, this specifies the interface for the `EncoderDecoder` class. `EncoderDecoder` consists of a set of Encoders and Decoders objects, which implement the interface of `EncoderBase` and `DecoderBase`, respectively. 
This composite object defines the behaviour of general Encoder-Decoder models. For instance, the `s2s` models implement a `EncoderS2S` and `DecoderS2S`, while `transformer` models implement a `EncoderTransformer` `DecoderTransformer`. These two use cases are both encapsulated in the `EncoderDecoder` framework. The addition of new encoder-decoder models only need implement their encoder and decoder classes. The `EncoderDecoder` models are constructed using a factory pattern in `src/models/model_factory.cpp`. + +The export of an ONNX-compliant model is handled by code here. +``` +marian/src +└── onnx +``` + + +## Utility +``` +marian/src +└── common +``` +The `common` subdirectory contains many useful helper functions and classes. +The majority of which fall under one of these categories: + - Command-line interface definition an Options object + - Definitions, macros and typedefs + - Filesystem and IO helpers + - Logging + - Memory management + - Signal handling + - Text manipulation + - Type-based dispatching and properties + +Beyond these areas, this folder also contains metadata, such as the program version, list of contributors, and the build flags used to compile it. + + +## External Libraries +``` +marian/src +└── 3rd_party + ``` +Many of the external libraries that Marian depends on are contained in `3rd_party`. + +These libraries are either copied into place here and version-controlled via the marian repository, or are included here as a submodule. Of these submodules, many have been forked and are maintained under the marian-nmt organisation. + + +## Tests and Examples +``` +marian/src +├── examples +└── tests +``` +There are basic tests and examples contained in `marian/src`. + +The unit tests cover basic graph functionality, checks on the output of operators, and the implementation of RNN attention, as well IO of binary files and manipulation of the options structure. + +The examples in this subdirectory demonstrate Marian's functionality using common datasets: Iris and MNIST. The Iris example, builds a simple dense feedforward network to perform a classification task. Over 200 epochs, it trains the network on target using mean cross-entropy. It then reports the accuracy of the model on the test-set. The MNIST example showcases more advanced features of Marian. It offers a choice of models (FFNN, LeNet), can leverage multi-device environments and uses a validator during training. This example more closely replicates the workflow of a typical Marian model, with batching of data and a model implemented in terms of Marian's model interfaces. + +``` +marian +├── examples +└── regression-tests +``` +Further tests and examples are contained in the root of the marian source code. The examples here are end-to-end tutorials on how to use Marian. These range from covering the basics of training a Marian model, to replicating the types of models presented at the Conference on Machine Translation (WMT). +Similarly, the tests in `regression-tests` are more numerous and detailed. They cover some 250+ areas of the code. While the unit tests described above check basic consistency of certain functions, the regression tests offer end-to-end verification of the functionality of Marian. 
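As a side note for readers of this new `doc/structure.md`: the placeholder style described under Expression Graph, where `_1 * cos(_2)` stands for "argument 1 times the cosine of argument 2", can be sketched in a few lines of standalone C++17. The sketch below is not Marian's actual `functional/` code; the names `ExprTag`, `Arg`, `Mul` and `Cos` are invented for the illustration, and the real implementation evaluates such formulas inside element-wise tensor kernels rather than on single floats.

```cpp
#include <cmath>
#include <cstdio>
#include <type_traits>

struct ExprTag {};  // marks types belonging to this mini expression language

// _N evaluates to the N-th argument of the final call.
template <int N>
struct Arg : ExprTag {
  template <class... Ts>
  float operator()(Ts... args) const {
    const float vals[] = {static_cast<float>(args)...};
    return vals[N];
  }
};

// Product of two sub-expressions, evaluated on the same argument list.
template <class L, class R>
struct Mul : ExprTag {
  L l; R r;
  template <class... Ts>
  float operator()(Ts... args) const { return l(args...) * r(args...); }
};

// Cosine of a sub-expression.
template <class E>
struct Cos : ExprTag {
  E e;
  template <class... Ts>
  float operator()(Ts... args) const { return std::cos(e(args...)); }
};

// Only combine types from this mini language, so built-in operator* is untouched.
template <class L, class R,
          class = std::enable_if_t<std::is_base_of_v<ExprTag, L> &&
                                   std::is_base_of_v<ExprTag, R>>>
Mul<L, R> operator*(L l, R r) { return {{}, l, r}; }

template <class E, class = std::enable_if_t<std::is_base_of_v<ExprTag, E>>>
Cos<E> cos(E e) { return {{}, e}; }

int main() {
  constexpr Arg<0> _1{};
  constexpr Arg<1> _2{};
  auto f = _1 * cos(_2);                 // the formula is built once, generically
  std::printf("%.3f\n", f(2.0f, 0.0f));  // prints 2.000, since cos(0) = 1
}
```

Writing formulas this way lets the same expression be applied element by element without committing to a particular argument type up front, which is the property the `functional` namespace exploits for its generic CPU and GPU kernels.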
+ + +[graph]: https://marian-nmt.github.io/docs/api/graph.html +[graph_ops]: https://marian-nmt.github.io/docs/api/operators.html From f00d0621897ecf5dc947bba186d3d5fc8434fba2 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 8 Feb 2022 08:39:24 -0800 Subject: [PATCH 146/254] update VERSION and CHANGELOG - Release 1.11.0 --- CHANGELOG.md | 8 ++++++++ VERSION | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf0f38445..ea3935b39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Added + +### Fixed + +### Changed + +## [1.11.0] - 2022-02-08 + ### Added - Parallelized data reading with e.g. `--data-threads 8` - Top-k sampling during decoding with e.g. `--output-sampling topk 10` diff --git a/VERSION b/VERSION index e4afc5eb8..cd74ac3b5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.10.42 +v1.11.0 From bcf29b8cd2d171af8df5d5f12c903e7ec94437cc Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Wed, 9 Feb 2022 17:05:48 +0000 Subject: [PATCH 147/254] Update acknowledgements (#914) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 69ae220cb..7fa003e19 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,8 @@ _Horizon 2020 Research and Innovation Programme_ under grant agreements 644333 ([TraMOOC](http://tramooc.eu/); 2015-2017), 644402 ([HiML](http://www.himl.eu/); 2015-2017), 825303 ([Bergamot](https://browser.mt/); 2019-2021), +the European Union's Connecting Europe Facility project +2019-EU-IA-0045 ([User-focused Marian](https://marian-project.eu); 2020-2022), the Amazon Academic Research Awards program, the World Intellectual Property Organization, and is based upon work supported in part by the Office of the Director of From b97645846aaf3dacf6e2e6c7ea20f8c49fb8e95d Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Wed, 9 Feb 2022 18:56:56 +0000 Subject: [PATCH 148/254] Update release workflow (#915) * Add CUDA 11.x to Windows installation script * Update release.yml workflow --- .github/workflows/release.yml | 34 ++++++++++++++--------------- scripts/ci/install_cuda_windows.ps1 | 5 ++++- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 636340b43..8a3761e3b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -29,7 +29,7 @@ on: - "[0-9]+.[0-9]+.[0-9]+*" env: - cuda_version: "10.2" + cuda_version: "11.2" gcc_version: 8 jobs: @@ -45,7 +45,7 @@ jobs: fbgemm: false cuda: false - name: "Build Ubuntu CPU+CUDA" - suffix: cuda10.2 + suffix: cuda11.2 fbgemm: false cuda: true @@ -59,8 +59,8 @@ jobs: # Get the tag name only to use it in the archive name. 
The variable github.ref can not be used because it starts with refs/tags/ TAG_NAME=$(echo ${{ github.ref }} | cut -d/ -f3-) # https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-environment-variable - echo "::set-env name=github_tag_name::${TAG_NAME}" - echo "::set-env name=archive_name::${{ github.event.repository.name }}-${TAG_NAME}_linux-x64-static_${{ matrix.suffix }}" + echo "github_tag_name=${TAG_NAME}" >> $GITHUB_ENV + echo "archive_name=${{ github.event.repository.name }}-${TAG_NAME}_linux-x64-static_${{ matrix.suffix }}" >> $GITHUB_ENV shell: bash - name: Checkout @@ -72,7 +72,7 @@ jobs: # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev # Boost is no longer pre-installed on GitHub-hosted runners - name: Install dependencies - run: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev + run: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev gcc-${{ env.gcc_version }} g++-${{ env.gcc_version }} # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL @@ -133,7 +133,7 @@ jobs: fbgemm: false cuda: false - name: "Build Windows CPU+CUDA" - suffix: cuda10.2 + suffix: cuda11.2 fbgemm: false cuda: true @@ -147,8 +147,8 @@ jobs: # Get the tag name only to use it in the archive name. The variable github.ref can not be used because it starts with refs/tags/ TAG_NAME=$(echo ${{ github.ref }} | cut -d/ -f3-) # https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-environment-variable - echo "::set-env name=github_tag_name::${TAG_NAME}" - echo "::set-env name=archive_name::${{ github.event.repository.name }}-${TAG_NAME}_windows-x64_${{ matrix.suffix }}" + echo "github_tag_name=${TAG_NAME}" >> $GITHUB_ENV + echo "archive_name=${{ github.event.repository.name }}-${TAG_NAME}_windows-x64_${{ matrix.suffix }}" >> $GITHUB_ENV shell: bash - name: Checkout @@ -162,20 +162,20 @@ jobs: Expand-Archive -Force mkl.zip ${{ github.workspace }}\mkl # Set the MKLROOT environment variable so that CMake can find MKL. # GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners - echo "::set-env name=MKLROOT::$env:GITHUB_WORKSPACE/mkl" - shell: powershell + echo "MKLROOT=$env:GITHUB_WORKSPACE/mkl" | Out-File -FilePath $env:GITHUB_ENV -Append + shell: pwsh - name: Install CUDA run: | .\scripts\ci\install_cuda_windows.ps1 '${{ env.cuda_version }}' # Set path to CUDA for subsequent steps so that CMake can find it - echo "::set-env name=CUDA_PATH::$env:CUDA_PATH" - echo "::add-path::$env:CUDA_PATH/bin" - shell: powershell + echo "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Append + shell: pwsh if: matrix.cuda == true - name: Prepare vcpkg - uses: lukka/run-vcpkg@v2 + uses: lukka/run-vcpkg@v4 with: vcpkgArguments: protobuf vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da @@ -184,7 +184,7 @@ jobs: # Build with a simplified CMake settings JSON file - name: Run CMake - uses: lukka/run-cmake@v2 + uses: lukka/run-cmake@v3 with: buildDirectory: ${{ github.workspace }}/build/ cmakeAppendedArgs: '-G Ninja @@ -207,14 +207,14 @@ jobs: run: | cp ../README.md . 
Compress-Archive -Path marian*.exe,README.md -DestinationPath ../${{ env.archive_name }}.zip - shell: powershell + shell: pwsh working-directory: build # For testing only #- name: Test archive #run: | #Compress-Archive -Path README.md -DestinationPath ${{ env.archive_name }}.zip - #shell: powershell + #shell: pwsh - name: Upload archive uses: actions/upload-artifact@v2 diff --git a/scripts/ci/install_cuda_windows.ps1 b/scripts/ci/install_cuda_windows.ps1 index a6d09ebc2..78b551be7 100644 --- a/scripts/ci/install_cuda_windows.ps1 +++ b/scripts/ci/install_cuda_windows.ps1 @@ -29,7 +29,10 @@ $CUDA_KNOWN_URLS = @{ "10.0" = "http://developer.nvidia.com/compute/cuda/10.0/Prod/network_installers/cuda_10.0.130_win10_network"; "10.1" = "http://developer.download.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.243_win10_network.exe"; "10.2" = "http://developer.download.nvidia.com/compute/cuda/10.2/Prod/network_installers/cuda_10.2.89_win10_network.exe"; - "11.0" = "http://developer.download.nvidia.com/compute/cuda/11.0.1/network_installers/cuda_11.0.1_win10_network.exe" + "11.0" = "http://developer.download.nvidia.com/compute/cuda/11.0.3/network_installers/cuda_11.0.3_win10_network.exe"; + "11.1" = "https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe"; + "11.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe"; + "11.3" = "https://developer.download.nvidia.com/compute/cuda/11.3.0/network_installers/cuda_11.3.0_win10_network.exe" } ## ------------------------------------------------------------------ From 73f18993074e899ca89775e7ac6497d63d45136c Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 10 Feb 2022 10:25:08 +0000 Subject: [PATCH 149/254] Add dependabot for git submodules (#916) --- .github/dependabot.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..4172ac3db --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +version: 2 + +updates: + # Maintain dependencies for Git Submodules + - package-ecosystem: "gitsubmodule" + directory: "/" + schedule: + interval: "daily" From a492bc57d25af84779995bc7a066044f545b6217 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Feb 2022 10:28:04 +0000 Subject: [PATCH 150/254] Bump regression-tests from `0716f4e` to `f7971b7` (#918) Bumps [regression-tests](https://github.com/marian-nmt/marian-regression-tests) from `0716f4e` to `f7971b7`. - [Release notes](https://github.com/marian-nmt/marian-regression-tests/releases) - [Commits](https://github.com/marian-nmt/marian-regression-tests/compare/0716f4e012d1e3f7543bffa8aecc97ce9c903e17...f7971b790abac39e557346bd5907c693d4939778) --- updated-dependencies: - dependency-name: regression-tests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 0716f4e01..f7971b790 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 0716f4e012d1e3f7543bffa8aecc97ce9c903e17 +Subproject commit f7971b790abac39e557346bd5907c693d4939778 From 4d44627f26430f45143c9eacbd7a9224da02a851 Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Thu, 10 Feb 2022 11:20:27 +0000 Subject: [PATCH 151/254] PyYaml safe_load instead of load (#913) * pyyaml safe_load instead of load * Update CHANGELOG --- CHANGELOG.md | 1 + scripts/bert/bert4marian.py | 2 +- scripts/contrib/model_info.py | 4 ++-- scripts/embeddings/export_embeddings.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea3935b39..90f913c40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added ### Fixed +- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load ### Changed diff --git a/scripts/bert/bert4marian.py b/scripts/bert/bert4marian.py index 8070c0fe9..9ef3da93a 100755 --- a/scripts/bert/bert4marian.py +++ b/scripts/bert/bert4marian.py @@ -31,7 +31,7 @@ args = parser.parse_args() print("Loading TensorFlow config from %s" % (args.bert_config,)) -bertConfig = yaml.load(open(args.bert_config)) +bertConfig = yaml.safe_load(open(args.bert_config)) bertConfigYamlStr = yaml.dump(bertConfig, default_flow_style=False) print(bertConfigYamlStr) diff --git a/scripts/contrib/model_info.py b/scripts/contrib/model_info.py index 9e9647efe..1a022e188 100755 --- a/scripts/contrib/model_info.py +++ b/scripts/contrib/model_info.py @@ -27,9 +27,9 @@ def main(): # fix the invalid trailing unicode character '#x0000' added to the YAML # string by the C++ cnpy library try: - yaml_node = yaml.load(yaml_text) + yaml_node = yaml.safe_load(yaml_text) except yaml.reader.ReaderError: - yaml_node = yaml.load(yaml_text[:-1]) + yaml_node = yaml.safe_load(yaml_text[:-1]) print(yaml_node[args.key]) else: diff --git a/scripts/embeddings/export_embeddings.py b/scripts/embeddings/export_embeddings.py index 3b4f3314b..f2f6031e2 100755 --- a/scripts/embeddings/export_embeddings.py +++ b/scripts/embeddings/export_embeddings.py @@ -18,7 +18,7 @@ def main(): print("Loading model") model = np.load(args.model) - special = yaml.load(model["special:model.yml"][:-1].tobytes()) + special = yaml.safe_load(model["special:model.yml"][:-1].tobytes()) if special["tied-embeddings-all"] or special["tied-embeddings-src"]: all_emb = model["Wemb"] From 17e55f5a7db3be0dae3273acc02a06d6869f9b31 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 10 Feb 2022 11:20:47 +0000 Subject: [PATCH 152/254] Update VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index cd74ac3b5..65b4811df 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.0 +v1.11.1 From 8fd553e5826d1a95feed7a8e9bff0803f2fcc8f9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Feb 2022 14:03:37 +0000 Subject: [PATCH 153/254] Bump examples from `6d5921c` to `0ca966e` (#919) Bumps [examples](https://github.com/marian-nmt/marian-examples) from `6d5921c` to `0ca966e`. 
- [Release notes](https://github.com/marian-nmt/marian-examples/releases) - [Commits](https://github.com/marian-nmt/marian-examples/compare/6d5921cc7de91f4e915b59e9c52c9a76c4e99b00...0ca966eadd2a4885a10d41e0f2f51445ab6fd038) --- updated-dependencies: - dependency-name: examples dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples b/examples index 6d5921cc7..0ca966ead 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 6d5921cc7de91f4e915b59e9c52c9a76c4e99b00 +Subproject commit 0ca966eadd2a4885a10d41e0f2f51445ab6fd038 From e6dbacb3109726333d3e8c6e3c4d170cc06b7f05 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 10 Feb 2022 16:30:21 +0000 Subject: [PATCH 154/254] Merged PR 22490: Faster LSH top-k for CPU This PR replaces the top-k search from FAISS on the CPU with a more specialized version for discrete distances in sub-linear time. --- src/command/marian_conv.cpp | 11 ++- src/layers/lsh.cpp | 88 +++++++++-------- src/layers/lsh.h | 24 +++-- src/layers/lsh_impl.h | 186 ++++++++++++++++++++++++++++++++++++ src/microsoft/quicksand.cpp | 7 +- 5 files changed, 260 insertions(+), 56 deletions(-) create mode 100644 src/layers/lsh_impl.h diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index 943f61d48..b4a5f3745 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -86,11 +86,17 @@ int main(int argc, char** argv) { graph->setDevice(CPU0); graph->load(modelFrom); + std::vector toBeLSHed; if(addLsh) { // Add dummy parameters for the LSH before the model gets actually initialized. // This create the parameters with useless values in the tensors, but it gives us the memory we need. + toBeLSHed = { + {lshOutputWeights, "lsh_output_codes", "lsh_output_rotation", lshNBits} + }; + graph->setReloaded(false); - lsh::addDummyParameters(graph, /*weights=*/lshOutputWeights, /*nBits=*/lshNBits); + for(auto p : toBeLSHed) + lsh::addDummyParameters(graph, /*paramInfo=*/p); graph->setReloaded(true); } @@ -99,7 +105,8 @@ int main(int argc, char** argv) { if(addLsh) { // After initialization, hijack the paramters for the LSH and force-overwrite with correct values. // Once this is done we can just pack and save as normal. 
- lsh::overwriteDummyParameters(graph, /*weights=*/lshOutputWeights); + for(auto p : toBeLSHed) + lsh::overwriteDummyParameters(graph, /*paramInfo=*/p); } // added a flag if the weights needs to be packed or not diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp index 8a9c924ee..73d45fc71 100644 --- a/src/layers/lsh.cpp +++ b/src/layers/lsh.cpp @@ -3,12 +3,14 @@ #include "common/utils.h" #include "3rd_party/faiss/utils/hamming.h" -#include "3rd_party/faiss/Index.h" #if BLAS_FOUND #include "3rd_party/faiss/VectorTransform.h" #endif +#include "common/timer.h" + +#include "layers/lsh_impl.h" namespace marian { namespace lsh { @@ -98,24 +100,22 @@ Expr encode(Expr input, Expr rotation) { return lambda(inputs, encodedShape, Type::uint8, encodeFwd, encodeHash); } -Expr rotator(Expr weights, int nBits) { +Expr rotator(Expr weights, int inDim, int nBits) { auto rotator = [](Expr out, const std::vector& inputs) { inputs; fillRandomRotationMatrix(out->val(), out->graph()->allocator()); }; static const size_t rotatorHash = (size_t)&rotator; - int dim = weights->shape()[-1]; - return lambda({weights}, {dim, nBits}, Type::float32, rotator, rotatorHash); + return lambda({weights}, {inDim, nBits}, Type::float32, rotator, rotatorHash); } -Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows) { +Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int dimK, int firstNRows, bool noSort/*= false*/) { ABORT_IF(encodedQuery->shape()[-1] != encodedWeights->shape()[-1], "Query and index bit vectors need to be of same size ({} != {})", encodedQuery->shape()[-1], encodedWeights->shape()[-1]); int currBeamSize = encodedQuery->shape()[0]; int batchSize = encodedQuery->shape()[2]; - int numHypos = currBeamSize * batchSize; auto search = [=](Expr out, const std::vector& inputs) { Expr encodedQuery = inputs[0]; @@ -128,30 +128,25 @@ Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows if(firstNRows != 0) wRows = firstNRows; - int qRows = encodedQuery->shape().elements() / bytesPerVector; - - uint8_t* qCodes = encodedQuery->val()->data(); - uint8_t* wCodes = encodedWeights->val()->data(); - - // use actual faiss code for performing the hamming search. - std::vector distances(qRows * k); - std::vector ids(qRows * k); - faiss::int_maxheap_array_t res = {(size_t)qRows, (size_t)k, ids.data(), distances.data()}; - faiss::hammings_knn_hc(&res, qCodes, wCodes, (size_t)wRows, (size_t)bytesPerVector, 0); - - // Copy int64_t indices to Marian index type and sort by increasing index value per hypothesis. - // The sorting is required as we later do a binary search on those values for reverse look-up. - uint32_t* outData = out->val()->data(); - for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { - size_t startIdx = k * hypoIdx; - size_t endIdx = startIdx + k; - for(size_t i = startIdx; i < endIdx; ++i) - outData[i] = (uint32_t)ids[i]; - std::sort(outData + startIdx, outData + endIdx); - } + ABORT_IF(dimK > wRows, "k is larger than number of candidate values?"); // @TODO: use min(k, wRows) silently? 
+ + IndexType* outData = out->val()->data(); + auto gather = [outData, dimK](IndexType rowId, IndexType k, IndexType kthColId, DistType /*dist*/) { + outData[rowId * dimK + k] = kthColId; + }; + + Parameters params; + params.k = dimK; + params.queryRows = encodedQuery->val()->data(); + params.numQueryRows = encodedQuery->shape().elements() / bytesPerVector; + params.codeRows = encodedWeights->val()->data(); + params.numCodeRows = wRows; + params.bytesPerVector = bytesPerVector; + + hammingTopK(params, gather); }; - Shape kShape({currBeamSize, batchSize, k}); + Shape kShape({currBeamSize, batchSize, dimK}); return lambda({encodedQuery, encodedWeights}, kShape, Type::uint32, search); } @@ -166,7 +161,7 @@ Expr search(Expr query, Expr weights, int k, int nBits, int firstNRows, bool abo } else { ABORT_IF(abortIfDynamic, "Dynamic creation of LSH rotation matrix prohibited"); LOG_ONCE(info, "Creating ad-hoc rotation matrix with shape {}", Shape({dim, nBits})); - rotMat = rotator(weights, nBits); + rotMat = rotator(weights, dim, nBits); } } @@ -195,34 +190,43 @@ Ptr randomRotation() { return New(); } -void addDummyParameters(Ptr graph, std::string weightsName, int nBitsRot) { - auto weights = graph->get(weightsName); - - ABORT_IF(!weights, "Trying to encode non-existing weights matrix {}??", weightsName); +void addDummyParameters(Ptr graph, ParamConvInfo paramInfo) { + auto weights = graph->get(paramInfo.name); + int nBitsRot = paramInfo.nBits; + + ABORT_IF(!weights, "Trying to encode non-existing weights matrix {}??", paramInfo.name); int nBits = weights->shape()[-1]; + if(paramInfo.transpose) + nBits = weights->shape()[-2]; + int nRows = weights->shape().elements() / nBits; Expr rotation; if(nBits != nBitsRot) { - LOG(info, "Adding LSH rotation parameter lsh_output_rotation with shape {}", Shape({nBits, nBitsRot})); - rotation = graph->param("lsh_output_rotation", {nBits, nBitsRot}, inits::dummy(), Type::float32); + LOG(info, "Adding LSH rotation parameter {} with shape {}", paramInfo.rotationName, Shape({nBits, nBitsRot})); + rotation = graph->param(paramInfo.rotationName, {nBits, nBitsRot}, inits::dummy(), Type::float32); nBits = nBitsRot; } int bytesPerVector = lsh::bytesPerVector(nBits); - LOG(info, "Adding LSH encoded weights lsh_output_codes with shape {}", Shape({nRows, bytesPerVector})); - auto codes = graph->param("lsh_output_codes", {nRows, bytesPerVector}, inits::dummy(), Type::uint8); + LOG(info, "Adding LSH encoded weights {} with shape {}", paramInfo.codesName, Shape({nRows, bytesPerVector})); + auto codes = graph->param(paramInfo.codesName, {nRows, bytesPerVector}, inits::dummy(), Type::uint8); } -void overwriteDummyParameters(Ptr graph, std::string weightsName) { - Expr weights = graph->get(weightsName); - Expr codes = graph->get("lsh_output_codes"); - Expr rotation = graph->get("lsh_output_rotation"); +void overwriteDummyParameters(Ptr graph, ParamConvInfo paramInfo) { + Expr weights = graph->get(paramInfo.name); + Expr codes = graph->get(paramInfo.codesName); + Expr rotation = graph->get(paramInfo.rotationName); - ABORT_IF(!weights, "Trying to encode non-existing weights matrix {}??", weightsName); + ABORT_IF(!weights, "Trying to encode non-existing weights matrix {}??", paramInfo.name); ABORT_IF(!codes, "Trying to overwrite non-existing LSH parameters lsh_output_codes??"); + if(paramInfo.transpose) { + weights = transpose(weights); + graph->forward(); + } + if(rotation) { fillRandomRotationMatrix(rotation->val(), weights->graph()->allocator()); 
encodeWithRotation(codes->val(), weights->val(), rotation->val(), weights->graph()->allocator()); diff --git a/src/layers/lsh.h b/src/layers/lsh.h index 7a5858914..5065ffcfd 100644 --- a/src/layers/lsh.h +++ b/src/layers/lsh.h @@ -17,26 +17,34 @@ namespace marian { namespace lsh { - - // return the number of full bytes required to encoded that many bits - int bytesPerVector(int nBits); - // encodes an input as a bit vector, with optional rotation Expr encode(Expr input, Expr rotator = nullptr); // compute the rotation matrix (maps weights->shape()[-1] to nbits floats) - Expr rotator(Expr weights, int nbits); + Expr rotator(Expr weights, int inDim, int nbits); // perform the LSH search on fully encoded input and weights, return k results (indices) per input row // @TODO: add a top-k like operator that also returns the bitwise computed distances - Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows = 0); + Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int k, int firstNRows = 0, bool noSort = false); // same as above, but performs encoding on the fly Expr search(Expr query, Expr weights, int k, int nbits, int firstNRows = 0, bool abortIfDynamic = false); + // struct for parameter conversion used in marian-conv + struct ParamConvInfo { + std::string name; + std::string codesName; + std::string rotationName; + int nBits; + bool transpose; + + ParamConvInfo(const std::string& name, const std::string& codesName, const std::string& rotationName, int nBits, bool transpose = false) + : name(name), codesName(codesName), rotationName(rotationName), nBits(nBits), transpose(transpose) {} + }; + // These are helper functions for encoding the LSH into the binary Marian model, used by marian-conv - void addDummyParameters(Ptr graph, std::string weightsName, int nBits); - void overwriteDummyParameters(Ptr graph, std::string weightsName); + void addDummyParameters(Ptr graph, ParamConvInfo paramInfo); + void overwriteDummyParameters(Ptr graph, ParamConvInfo paramInfo); /** * Computes a random rotation matrix for LSH hashing. diff --git a/src/layers/lsh_impl.h b/src/layers/lsh_impl.h new file mode 100644 index 000000000..d87d23e07 --- /dev/null +++ b/src/layers/lsh_impl.h @@ -0,0 +1,186 @@ +#pragma once + +#include + +#ifdef _MSC_VER +#define __builtin_popcountl __popcnt64 +#define __builtin_popcount __popcnt +#endif + +namespace marian { +namespace lsh { + + struct Parameters { + int k; + uint8_t* queryRows; + int numQueryRows; + uint8_t* codeRows; + int numCodeRows; + int bytesPerVector; + }; + + typedef uint32_t DistType; + typedef uint64_t ChunkType; + + inline DistType popcount(const ChunkType& chunk) { + switch (sizeof(ChunkType)) { + case 8 : return (DistType)__builtin_popcountl((uint64_t)chunk); + case 4 : return (DistType)__builtin_popcount((uint32_t)chunk); + default: ABORT("Size {} not supported", sizeof(ChunkType)); + } + } + + // return the number of full bytes required to encoded that many bits + inline int bytesPerVector(int nBits); + + // compute top-k hamming distances for given query and weight binary codes. Faster than FAISS version, especially for larger k nearly constant wrt. k. + template + inline constexpr T getStaticOrDynamic(T dynamicValue) { + return Dynamic ? 
dynamicValue : StaticValue; + } + + template + inline DistType hamming(ChunkType* queryRow, ChunkType* codeRow, int stepsDynamic = 0) { + static_assert(Dynamic == true || StepsStatic != 0, "Either define dynamic use of steps or provide non-zero template argument"); + DistType dist = 0; + for(int i = 0; i < getStaticOrDynamic(stepsDynamic); ++i) + dist += popcount(queryRow[i] ^ codeRow[i]); + return dist; + } + + template + inline void hammingTopKUnrollWarp(int queryOffset, const Parameters& parameters, const Functor& gather) { + const int numBits = getStaticOrDynamic(parameters.bytesPerVector) * 8; + ABORT_IF(numBits % 64 != 0, "LSH hash size has to be a multiple of 64"); + + // counter to keep track of seen hamming distances + std::vector> counter(warpSize, std::vector(numBits, 0)); + // buffer the distances for query vector warpRowId to all weight weight vectors codeRowId + std::vector> distBuffer(warpSize, std::vector(getStaticOrDynamic(parameters.numCodeRows), 0)); + // minimal distances per query + std::vector minDist(warpSize); + + constexpr int StepStatic = BytesPerVector / sizeof(ChunkType); + int stepDynamic = parameters.bytesPerVector / sizeof(ChunkType); + + ChunkType* codeRow = (ChunkType*)parameters.codeRows; + + for(int warpRowId = 0; warpRowId < warpSize; warpRowId++) { + std::memset(counter[warpRowId].data(), 0, numBits * sizeof(DistType)); // Reset the counter via memset to 0 + minDist[warpRowId] = (DistType)numBits; + } + + for(IndexType codeRowId = 0; codeRowId < (IndexType)getStaticOrDynamic(parameters.numCodeRows); ++codeRowId, codeRow += getStaticOrDynamic(stepDynamic)) { + ChunkType* queryRow = (ChunkType*)parameters.queryRows; + for(IndexType warpRowId = 0; warpRowId < warpSize; warpRowId++, queryRow += getStaticOrDynamic(stepDynamic)) { + // Compute the bit-wise hamming distance + DistType dist = hamming(queryRow, codeRow, stepDynamic); + + // Record the minimal distance seen for this query vector wrt. all weight vectors + if(dist < minDist[warpRowId]) { + minDist[warpRowId] = dist; + } + + // Record the number of weight vectors that have this distance from the query vector. + // Note, because there is at most numBits different distances this can be trivially done. + // Not the case for generic distances like float. + counter[warpRowId][dist]++; + + // Record the distance for this weight vector + distBuffer[warpRowId][codeRowId] = dist; + } + } + // warp finished, harvest k top distances + + for(int warpRowId = 0; warpRowId < warpSize; warpRowId++) { + // Here we search for the distance at which we have seen equal or more than k elements with + // smaller distances. We start with the minimal distance from above which is its own address + // to the counter. + DistType maxDist = minDist[warpRowId]; + size_t cummulativeDistances = 0; + + // Accumulate number of elements until we reach k in growing distance order. Note that + // counter is indexed by hamming distance - from lowest to highest. Some slots will be 0. + // The cumulative sum from position a to b tells you how many elements have distances smaller + // than the distance at b. + while(cummulativeDistances < parameters.k) + cummulativeDistances += counter[warpRowId][maxDist++]; + if(cummulativeDistances) + maxDist--; // fix overcounting + + // Usually, we overshoot by a couple of elements and we need to take care of the distance at which the k-th + // element sits. This elements has more neighbors at the same distance, but we only care for them + // as long we have not reached k elements in total. 
+ // By contrast, we trivially collect all elements below that distance -- these are always safe. + + // This is the number of elements we need to collect at the last distance. + DistType maxDistLimit = /*number of elements at maxDist=*/counter[warpRowId][maxDist] - /*overflow=*/((DistType)cummulativeDistances - (DistType)parameters.k); + IndexType kSeen = 0; + IndexType kSeenAtKDist = 0; + + for(IndexType codeRowId = 0; kSeen < (IndexType)parameters.k && codeRowId < (IndexType)getStaticOrDynamic(parameters.numCodeRows); ++codeRowId) { + DistType dist = distBuffer[warpRowId][codeRowId]; + // - if the current distance is smaller than the maxDist, just consume. + // - if the distance is equal to maxDist, make sure to only consume maxDistLimit elements at maxDist + // and ignore the rest (smaller indices make it in first). + // - after we finish this loop we have exactly k top values for every query row in original index order. + int queryRowId = queryOffset + warpRowId; + if(dist < maxDist) { + gather(queryRowId, (IndexType)kSeen, codeRowId, dist); + kSeen++; + } else if(dist == maxDist && kSeenAtKDist < (DistType)maxDistLimit) { + gather(queryRowId, (IndexType)kSeen, codeRowId, dist); + kSeen++; + kSeenAtKDist++; + } + } + } + } + + // Faster top-k search for hamming distance. The idea here is that instead of sorting the elements we find a hamming distances at which it is safe + // to copy the given index. Copying only the indices below that distance is guaranteed to results in no more than k elements. For elements at that + // distance we need to correct for overshooting. + // Once we have that distance we only need to traverse the set of distances. In the end we get exactly k elements per queryRows vector. + template + inline void hammingTopKUnroll(const Parameters& parameters, const Functor& gather) { + static_assert(Dynamic == true || (NumCodeRows != 0 && BytesPerVector != 0), "Either define dynamic use of variables or provide non-zero template arguments"); + + int warpSize = 4; // starting warpSize of 4 seems optimal + auto warpParameters = parameters; + for(int queryOffset = 0; queryOffset < parameters.numQueryRows; queryOffset += warpSize) { + while(parameters.numQueryRows - queryOffset < warpSize) + warpSize /= 2; + + int step = getStaticOrDynamic(parameters.bytesPerVector); + warpParameters.queryRows = parameters.queryRows + queryOffset * step; + warpParameters.numQueryRows = warpSize; + switch(warpSize) { + case 8 : hammingTopKUnrollWarp<8, NumCodeRows, BytesPerVector, Dynamic>(queryOffset, warpParameters, gather); break; + case 4 : hammingTopKUnrollWarp<4, NumCodeRows, BytesPerVector, Dynamic>(queryOffset, warpParameters, gather); break; + case 2 : hammingTopKUnrollWarp<2, NumCodeRows, BytesPerVector, Dynamic>(queryOffset, warpParameters, gather); break; + case 1 : hammingTopKUnrollWarp<1, NumCodeRows, BytesPerVector, Dynamic>(queryOffset, warpParameters, gather); break; + default: ABORT("Unhandled warpSize = {}??", warpSize); + } + } + } + + template + inline void hammingTopK(const Parameters& parameters, const Functor& gather) { + if(parameters.numCodeRows == 2048 && parameters.bytesPerVector == 64) + hammingTopKUnroll< 2048, 64, false>(parameters, gather); + else if(parameters.numCodeRows == 4096 && parameters.bytesPerVector == 64) + hammingTopKUnroll< 4096, 64, false>(parameters, gather); + else if(parameters.numCodeRows == 6144 && parameters.bytesPerVector == 64) + hammingTopKUnroll< 6144, 64, false>(parameters, gather); + else if(parameters.numCodeRows == 8192 && 
parameters.bytesPerVector == 64) + hammingTopKUnroll< 8192, 64, false>(parameters, gather); + else if(parameters.numCodeRows == 32000 && parameters.bytesPerVector == 64) + hammingTopKUnroll<32000, 64, false>(parameters, gather); + else if(parameters.numCodeRows == 32000 && parameters.bytesPerVector == 128) + hammingTopKUnroll<32000, 128, false>(parameters, gather); + else + hammingTopKUnroll< 0, 0, true>(parameters, gather); + } + +} // namespace lsh +} // namespace marian \ No newline at end of file diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index a439197b7..316c66d11 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -178,8 +178,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { auto score = std::get<2>(result); // determine alignment if present AlignmentSets alignmentSets; - if (options_->hasAndNotEmpty("alignment")) - { + if (options_->hasAndNotEmpty("alignment")) { float alignmentThreshold; auto alignment = options_->get("alignment"); // @TODO: this logic now exists three times in Marian if (alignment == "soft") @@ -287,7 +286,7 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP // Add dummy parameters for the LSH before the model gets actually initialized. // This create the parameters with useless values in the tensors, but it gives us the memory we need. graph->setReloaded(false); - lsh::addDummyParameters(graph, /*weights=*/lshOutputWeights, /*nBits=*/lshNBits); + lsh::addDummyParameters(graph, /*paramInfo=*/{lshOutputWeights, "lsh_output_codes", "lsh_output_rotation", lshNBits}); graph->setReloaded(true); } @@ -296,7 +295,7 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP if(addLsh) { // After initialization, hijack the paramters for the LSH and force-overwrite with correct values. // Once this is done we can just pack and save as normal. - lsh::overwriteDummyParameters(graph, /*weights=*/lshOutputWeights); + lsh::overwriteDummyParameters(graph, /*paramInfo=*/{lshOutputWeights, "lsh_output_codes", "lsh_output_rotation", lshNBits}); } Type targetPrecType = (Type) targetPrec; From b3feecc82b55d235c2bffee2aecfafb8f9ead9bb Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 10 Feb 2022 16:34:23 +0000 Subject: [PATCH 155/254] Merged PR 22483: Make C++17 the official standard for Marian Make C++17 the official standard for Marian --- CHANGELOG.md | 2 ++ CMakeLists.txt | 7 +++-- VERSION | 2 +- azure-pipelines.yml | 48 ++--------------------------- scripts/ci/install_cuda_ubuntu.sh | 11 +++++-- src/3rd_party/half_float/umHalf.inl | 2 +- 6 files changed, 20 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea3935b39..abb2ecadb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### Changed +- Changed minimal C++ standard to C++-17 + ## [1.11.0] - 2022-02-08 ### Added diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c41b365c..dbad75cb5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ if (POLICY CMP0074) endif () project(marian CXX C) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.") @@ -91,10 +91,11 @@ if(MSVC) # C4310: cast truncates constant value # C4324: 'marian::cpu::int16::`anonymous-namespace'::ScatterPut': structure was padded due to alignment specifier # C4702: unreachable code; note it is also disabled globally in the VS project file + # C4996: warning STL4015: The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17 if(USE_SENTENCEPIECE) - set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4100\"") + set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4996\" /wd\"4100\"") else() - set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\"") + set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\" /wd\"4702\" /wd\"4996\"") endif() # set(INTRINSICS "/arch:AVX") diff --git a/VERSION b/VERSION index cd74ac3b5..65b4811df 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.0 +v1.11.1 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0348ebb42..f5e92400a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -192,6 +192,9 @@ stages: displayName: Ubuntu timeoutInMinutes: 90 + # Minimal tested configurations for marian-dev v1.11 and C++17: + # * Ubuntu 16.04, GCC 7.5, CMake 3.10.2, CUDA 9.2 (probably GCC 6 would work too) + # * Ubuntu 18.04, GCC 7.5, CMake 3.12.2, CUDA 10.0 strategy: matrix: ################################################################ @@ -319,51 +322,6 @@ stages: displayName: Print versions workingDirectory: build - ###################################################################### - - job: BuildUbuntuMinimal - condition: eq(${{ parameters.runBuilds }}, true) - displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5 - - pool: - vmImage: ubuntu-18.04 - - steps: - - checkout: self - submodules: true - - # The script simplifies installation of different versions of CUDA. - - bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0" - displayName: Install CUDA - - # CMake 3.5.1 is the minimum version supported - - bash: | - wget -nv https://cmake.org/files/v3.5/cmake-3.5.1-Linux-x86_64.tar.gz - tar zxf cmake-3.5.1-Linux-x86_64.tar.gz - ./cmake-3.5.1-Linux-x86_64/bin/cmake --version - displayName: Download CMake - - # GCC 5 is the minimum version supported - - bash: | - /usr/bin/gcc-7 --version - mkdir -p build - cd build - CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \ - ../cmake-3.5.1-Linux-x86_64/bin/cmake .. 
\ - -DCOMPILE_CPU=on \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 - displayName: Configure CMake - - - bash: make -j3 - displayName: Compile - workingDirectory: build - - - bash: | - ./marian --version - ./marian-decoder --version - ./marian-scorer --version - displayName: Print versions - workingDirectory: build - ###################################################################### - job: BuildMacOS condition: eq(${{ parameters.runBuilds }}, true) diff --git a/scripts/ci/install_cuda_ubuntu.sh b/scripts/ci/install_cuda_ubuntu.sh index b058294ae..de60a5b65 100755 --- a/scripts/ci/install_cuda_ubuntu.sh +++ b/scripts/ci/install_cuda_ubuntu.sh @@ -60,6 +60,13 @@ CUDA_PACKAGES_IN=( CUDA_PACKAGES="" for package in "${CUDA_PACKAGES_IN[@]}"; do + # @todo This is not perfect. Should probably provide a separate list for diff versions + # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y + if [[ "${package}" == "nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + package="compiler" + elif [[ "${package}" == "compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + package="nvcc" + fi # Build the full package name and append to the string. CUDA_PACKAGES+=" cuda-${package}-${CUDA_MAJOR}-${CUDA_MINOR}" done @@ -72,8 +79,8 @@ echo "CUDA_PACKAGES ${CUDA_PACKAGES}" PIN_FILENAME="cuda-ubuntu${UBUNTU_VERSION}.pin" PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/${PIN_FILENAME}" -APT_KEY_URL="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/7fa2af80.pub" -REPO_URL="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/" +APT_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/7fa2af80.pub" +REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/" echo "PIN_FILENAME ${PIN_FILENAME}" echo "PIN_URL ${PIN_URL}" diff --git a/src/3rd_party/half_float/umHalf.inl b/src/3rd_party/half_float/umHalf.inl index 3f5285a2e..257ba1c20 100644 --- a/src/3rd_party/half_float/umHalf.inl +++ b/src/3rd_party/half_float/umHalf.inl @@ -344,7 +344,7 @@ inline HalfFloat operator+ (HalfFloat one, HalfFloat two) // compute the difference between the two exponents. shifts with negative // numbers are undefined, thus we need two code paths - register int expDiff = one.IEEE.Exp - two.IEEE.Exp; + /*register*/ int expDiff = one.IEEE.Exp - two.IEEE.Exp; if (0 == expDiff) { From 3b21ff39c5814214d715215858898f05a30e5acb Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 10 Feb 2022 08:35:49 -0800 Subject: [PATCH 156/254] update VERSION and CHANGELOG --- CHANGELOG.md | 1 + VERSION | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abb2ecadb..a724d254c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### Changed - Changed minimal C++ standard to C++-17 +- Faster LSH top-k search on CPU ## [1.11.0] - 2022-02-08 diff --git a/VERSION b/VERSION index 65b4811df..07fb54b5d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.1 +v1.11.2 From 4b51dcbd066b927e29e4007512c1f887ee1a350f Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 11 Feb 2022 13:50:47 +0000 Subject: [PATCH 157/254] Merged PR 22524: Optimize guided alignment training speed via sparse alignments - part 1 This replaces dense alignment storage and training with a sparse representation. Training speed with guided alignment matches now nearly normal training speed, regaining about 25% speed. This is no. 1 of 2 PRs. The next one will introduce a new guided-alignment training scheme with better alignment accuracy. --- CHANGELOG.md | 2 +- VERSION | 2 +- regression-tests | 2 +- src/common/config_parser.cpp | 2 +- src/data/alignment.cpp | 39 +++++++++++-- src/data/alignment.h | 21 +++++-- src/data/batch.h | 2 +- src/data/corpus.cpp | 13 ++--- src/data/corpus_base.cpp | 25 ++++---- src/data/corpus_base.h | 46 ++++++--------- src/examples/mnist/dataset.h | 2 +- src/graph/expression_operators.cpp | 2 + src/layers/guided_alignment.h | 93 ++++++++++++++---------------- 13 files changed, 138 insertions(+), 113 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a724d254c..ad4642f20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Fixed ### Changed - +- Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce. - Changed minimal C++ standard to C++-17 - Faster LSH top-k search on CPU diff --git a/VERSION b/VERSION index 07fb54b5d..3d461ead6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.2 +v1.11.3 diff --git a/regression-tests b/regression-tests index 0716f4e01..d59f7ad85 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 0716f4e012d1e3f7543bffa8aecc97ce9c903e17 +Subproject commit d59f7ad85ecfdf4a788c095ac9fc1c447094e39e diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 837bee53f..0d9564953 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -510,7 +510,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "none"); cli.add("--guided-alignment-cost", "Cost type for guided alignment: ce (cross-entropy), mse (mean square error), mult (multiplication)", - "mse"); + "ce"); cli.add("--guided-alignment-weight", "Weight for guided alignment cost", 0.1); diff --git a/src/data/alignment.cpp b/src/data/alignment.cpp index 928beb21f..3b7e0d666 100644 --- a/src/data/alignment.cpp +++ b/src/data/alignment.cpp @@ -2,6 +2,8 @@ #include "common/utils.h" #include +#include +#include namespace marian { namespace data { @@ -10,10 +12,11 @@ WordAlignment::WordAlignment() {} WordAlignment::WordAlignment(const std::vector& align) : data_(align) {} -WordAlignment::WordAlignment(const std::string& line) { +WordAlignment::WordAlignment(const std::string& line, size_t srcEosPos, size_t tgtEosPos) { std::vector atok = utils::splitAny(line, " -"); for(size_t i = 0; i < atok.size(); i += 2) - data_.emplace_back(Point{ (size_t)std::stoi(atok[i]), (size_t)std::stoi(atok[i + 1]), 1.f }); + data_.push_back(Point{ (size_t)std::stoi(atok[i]), (size_t)std::stoi(atok[i + 1]), 1.f }); + data_.push_back(Point{ srcEosPos, tgtEosPos, 1.f }); // add alignment point for both EOS symbols } void 
WordAlignment::sort() { @@ -22,6 +25,35 @@ void WordAlignment::sort() { }); } +void WordAlignment::normalize(bool reverse/*=false*/) { + std::vector counts; + counts.reserve(data_.size()); + + // reverse==false : normalize target word prob by number of source words + // reverse==true : normalize source word prob by number of target words + auto srcOrTgt = [](const Point& p, bool reverse) { + return reverse ? p.srcPos : p.tgtPos; + }; + + for(const auto& a : data_) { + size_t pos = srcOrTgt(a, reverse); + if(counts.size() <= pos) + counts.resize(pos + 1, 0); + counts[pos]++; + } + + // a.prob at this point is either 1 or normalized to a different value, + // but we just set it to 1 / count, so multiple calls result in re-normalization + // regardless of forward or reverse direction. We also set the remaining values to 1. + for(auto& a : data_) { + size_t pos = srcOrTgt(a, reverse); + if(counts[pos] > 1) + a.prob = 1.f / counts[pos]; + else + a.prob = 1.f; + } +} + std::string WordAlignment::toString() const { std::stringstream str; for(auto p = begin(); p != end(); ++p) { @@ -32,7 +64,7 @@ std::string WordAlignment::toString() const { return str.str(); } -WordAlignment ConvertSoftAlignToHardAlign(SoftAlignment alignSoft, +WordAlignment ConvertSoftAlignToHardAlign(const SoftAlignment& alignSoft, float threshold /*= 1.f*/) { WordAlignment align; // Alignments by maximum value @@ -58,7 +90,6 @@ WordAlignment ConvertSoftAlignToHardAlign(SoftAlignment alignSoft, } } } - // Sort alignment pairs in ascending order align.sort(); diff --git a/src/data/alignment.h b/src/data/alignment.h index 1c68bb39e..f27bea383 100644 --- a/src/data/alignment.h +++ b/src/data/alignment.h @@ -1,20 +1,22 @@ #pragma once #include +#include #include namespace marian { namespace data { class WordAlignment { - struct Point - { +public: + struct Point { size_t srcPos; size_t tgtPos; float prob; }; private: std::vector data_; + public: WordAlignment(); @@ -28,11 +30,14 @@ class WordAlignment { public: /** - * @brief Constructs word alignments from textual representation. + * @brief Constructs word alignments from textual representation. Adds alignment point for externally + * supplied EOS positions in source and target string. * * @param line String in the form of "0-0 1-1 1-2", etc. */ - WordAlignment(const std::string& line); + WordAlignment(const std::string& line, size_t srcEosPos, size_t tgtEosPos); + + Point& operator[](size_t i) { return data_[i]; } auto begin() const -> decltype(data_.begin()) { return data_.begin(); } auto end() const -> decltype(data_.end()) { return data_.end(); } @@ -46,6 +51,12 @@ class WordAlignment { */ void sort(); + /** + * @brief Normalizes alignment probabilities of target words to sum to 1 over source words alignments. + * This is needed for correct cost computation for guided alignment training with CE cost criterion. + */ + void normalize(bool reverse=false); + /** * @brief Returns textual representation. */ @@ -56,7 +67,7 @@ class WordAlignment { // Also used on QuickSAND boundary where beam and batch size is 1. 
Then it is simply [t][s] -> P(s|t) typedef std::vector> SoftAlignment; // [trg pos][beam depth * max src length * batch size] -WordAlignment ConvertSoftAlignToHardAlign(SoftAlignment alignSoft, +WordAlignment ConvertSoftAlignToHardAlign(const SoftAlignment& alignSoft, float threshold = 1.f); std::string SoftAlignToString(SoftAlignment align); diff --git a/src/data/batch.h b/src/data/batch.h index 3c592b315..761f46a4d 100644 --- a/src/data/batch.h +++ b/src/data/batch.h @@ -24,7 +24,7 @@ class Batch { const std::vector& getSentenceIds() const { return sentenceIds_; } void setSentenceIds(const std::vector& ids) { sentenceIds_ = ids; } - virtual void setGuidedAlignment(std::vector&&) = 0; + virtual void setGuidedAlignment(std::vector&&) = 0; virtual void setDataWeights(const std::vector&) = 0; virtual ~Batch() {}; protected: diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 643a7de93..2fbe49823 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -132,14 +132,13 @@ SentenceTuple Corpus::next() { tup.markAltered(); addWordsToSentenceTuple(fields[i], vocabId, tup); } - - // weights are added last to the sentence tuple, because this runs a validation that needs - // length of the target sequence - if(alignFileIdx_ > -1) - addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); - if(weightFileIdx_ > -1) - addWeightsToSentenceTuple(fields[weightFileIdx_], tup); } + // weights are added last to the sentence tuple, because this runs a validation that needs + // length of the target sequence + if(alignFileIdx_ > -1) + addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); + if(weightFileIdx_ > -1) + addWeightsToSentenceTuple(fields[weightFileIdx_], tup); // check if all streams are valid, that is, non-empty and no longer than maximum allowed length if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 636752c97..71c9f9908 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -429,11 +429,13 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const { - ABORT_IF(rightLeft_, - "Guided alignment and right-left model cannot be used " - "together at the moment"); + ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used together at the moment"); + ABORT_IF(tup.size() != 2, "Using alignment between source and target, but sentence tuple has {} elements??", tup.size()); - auto align = WordAlignment(line); + size_t srcEosPos = tup[0].size() - 1; + size_t tgtEosPos = tup[1].size() - 1; + + auto align = WordAlignment(line, srcEosPos, tgtEosPos); tup.setAlignment(align); } @@ -457,22 +459,17 @@ void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupl void CorpusBase::addAlignmentsToBatch(Ptr batch, const std::vector& batchVector) { - int srcWords = (int)batch->front()->batchWidth(); - int trgWords = (int)batch->back()->batchWidth(); + std::vector aligns; + int dimBatch = (int)batch->getSentenceIds().size(); - - std::vector aligns(srcWords * dimBatch * trgWords, 0.f); - + aligns.reserve(dimBatch); + for(int b = 0; b < dimBatch; ++b) { - // If the batch vector is altered within marian by, for example, case augmentation, // the guided alignments we received for this tuple cease to be valid. // Hence skip setting alignments for that sentence tuple.. 
if (!batchVector[b].isAltered()) { - for(auto p : batchVector[b].getAlignment()) { - size_t idx = p.srcPos * dimBatch * trgWords + b * trgWords + p.tgtPos; - aligns[idx] = 1.f; - } + aligns.push_back(std::move(batchVector[b].getAlignment())); } } batch->setGuidedAlignment(std::move(aligns)); diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index a54c20f88..4e6d923ee 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -338,7 +338,7 @@ class SubBatch { class CorpusBatch : public Batch { protected: std::vector> subBatches_; - std::vector guidedAlignment_; // [max source len, batch size, max target len] flattened + std::vector guidedAlignment_; // [max source len, batch size, max target len] flattened std::vector dataWeights_; public: @@ -444,8 +444,17 @@ class CorpusBatch : public Batch { if(options->get("guided-alignment", std::string("none")) != "none") { // @TODO: if > 1 encoder, verify that all encoders have the same sentence lengths - std::vector alignment(batchSize * lengths.front() * lengths.back(), - 0.f); + + std::vector alignment; + for(size_t k = 0; k < batchSize; ++k) { + data::WordAlignment perSentence; + // fill with random alignment points, add more twice the number of words to be safe. + for(size_t j = 0; j < lengths.back() * 2; ++j) { + size_t i = rand() % lengths.back(); + perSentence.push_back(i, j, 1.0f); + } + alignment.push_back(std::move(perSentence)); + } batch->setGuidedAlignment(std::move(alignment)); } @@ -501,29 +510,14 @@ class CorpusBatch : public Batch { } if(!guidedAlignment_.empty()) { - size_t oldTrgWords = back()->batchWidth(); - size_t oldSize = size(); - pos = 0; for(auto split : splits) { auto cb = std::static_pointer_cast(split); - size_t srcWords = cb->front()->batchWidth(); - size_t trgWords = cb->back()->batchWidth(); size_t dimBatch = cb->size(); - - std::vector aligns(srcWords * dimBatch * trgWords, 0.f); - - for(size_t i = 0; i < dimBatch; ++i) { - size_t bi = i + pos; - for(size_t sid = 0; sid < srcWords; ++sid) { - for(size_t tid = 0; tid < trgWords; ++tid) { - size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid; // [sid, bi, tid] - size_t idx = sid * dimBatch * trgWords + i * trgWords + tid; - aligns[idx] = guidedAlignment_[bidx]; - } - } - } - cb->setGuidedAlignment(std::move(aligns)); + std::vector batchAlignment; + for(size_t i = 0; i < dimBatch; ++i) + batchAlignment.push_back(std::move(guidedAlignment_[i + pos])); + cb->setGuidedAlignment(std::move(batchAlignment)); pos += dimBatch; } } @@ -556,15 +550,11 @@ class CorpusBatch : public Batch { return splits; } - const std::vector& getGuidedAlignment() const { return guidedAlignment_; } // [dimSrcWords, dimBatch, dimTrgWords] flattened - void setGuidedAlignment(std::vector&& aln) override { + const std::vector& getGuidedAlignment() const { return guidedAlignment_; } // [dimSrcWords, dimBatch, dimTrgWords] flattened + void setGuidedAlignment(std::vector&& aln) override { guidedAlignment_ = std::move(aln); } - size_t locateInGuidedAlignments(size_t b, size_t s, size_t t) { - return ((s * size()) + b) * widthTrg() + t; - } - std::vector& getDataWeights() { return dataWeights_; } void setDataWeights(const std::vector& weights) override { dataWeights_ = weights; diff --git a/src/examples/mnist/dataset.h b/src/examples/mnist/dataset.h index b0492b852..c665fa655 100644 --- a/src/examples/mnist/dataset.h +++ b/src/examples/mnist/dataset.h @@ -77,7 +77,7 @@ class DataBatch : public Batch { size_t size() const override { return inputs_.front().shape()[0]; } - 
void setGuidedAlignment(std::vector&&) override { + void setGuidedAlignment(std::vector&&) override { ABORT("Guided alignment in DataBatch is not implemented"); } void setDataWeights(const std::vector&) override { diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 5294fca3f..ca5e68054 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -286,6 +286,8 @@ Expr operator/(float a, Expr b) { /*********************************************************/ Expr concatenate(const std::vector& concats, int ax) { + if(concats.size() == 1) + return concats[0]; return Expression(concats, ax); } diff --git a/src/layers/guided_alignment.h b/src/layers/guided_alignment.h index f08d3f092..d2171c508 100644 --- a/src/layers/guided_alignment.h +++ b/src/layers/guided_alignment.h @@ -5,62 +5,57 @@ namespace marian { -static inline RationalLoss guidedAlignmentCost(Ptr /*graph*/, +static inline const std::tuple, std::vector> +guidedAlignmentToSparse(Ptr batch) { + int trgWords = (int)batch->back()->batchWidth(); + int dimBatch = (int)batch->size(); + + typedef std::tuple BiPoint; + std::vector byIndex; + byIndex.reserve(dimBatch * trgWords); + + for(size_t b = 0; b < dimBatch; ++b) { + auto guidedAlignmentFwd = batch->getGuidedAlignment()[b]; // this copies + guidedAlignmentFwd.normalize(/*reverse=*/false); // normalize forward + for(size_t i = 0; i < guidedAlignmentFwd.size(); ++i) { + auto pFwd = guidedAlignmentFwd[i]; + IndexType idx = (IndexType)(pFwd.srcPos * dimBatch * trgWords + b * trgWords + pFwd.tgtPos); + byIndex.push_back({idx, pFwd.prob}); + } + } + + std::sort(byIndex.begin(), byIndex.end(), [](const BiPoint& a, const BiPoint& b) { return std::get<0>(a) < std::get<0>(b); }); + std::vector indices; std::vector valuesFwd; + indices.reserve(byIndex.size()); valuesFwd.reserve(byIndex.size()); + for(auto& p : byIndex) { + indices.push_back((IndexType)std::get<0>(p)); + valuesFwd.push_back(std::get<1>(p)); + } + + return {indices, valuesFwd}; +} + +static inline RationalLoss guidedAlignmentCost(Ptr graph, Ptr batch, Ptr options, Expr attention) { // [beam depth=1, max src length, batch size, tgt length] - std::string guidedLossType = options->get("guided-alignment-cost"); // @TODO: change "cost" to "loss" + + // We dropped support for other losses which are not possible to implement with sparse labels. + // They were most likely not used anyway. 
+ ABORT_IF(guidedLossType != "ce", "Only alignment loss type 'ce' is supported"); + float guidedLossWeight = options->get("guided-alignment-weight"); - const auto& shape = attention->shape(); // [beam depth=1, max src length, batch size, tgt length] - float epsilon = 1e-6f; - Expr alignmentLoss; // sum up loss over all attention/alignment positions - size_t numLabels; - if(guidedLossType == "ce") { - // normalizedAlignment is multi-hot, but ce requires normalized probabilities, so need to normalize to P(s|t) - auto dimBatch = shape[-2]; - auto dimTrgWords = shape[-1]; - auto dimSrcWords = shape[-3]; - ABORT_IF(shape[-4] != 1, "Guided alignments with beam??"); - auto normalizedAlignment = batch->getGuidedAlignment(); // [dimSrcWords, dimBatch, dimTrgWords] flattened, matches shape of 'attention' - auto srcBatch = batch->front(); - const auto& srcMask = srcBatch->mask(); - ABORT_IF(shape.elements() != normalizedAlignment.size(), "Attention-matrix and alignment shapes differ??"); - ABORT_IF(dimBatch != batch->size() || dimTrgWords != batch->widthTrg() || dimSrcWords != batch->width(), "Attention-matrix and batch shapes differ??"); - auto locate = [=](size_t s, size_t b, size_t t) { return ((s * dimBatch) + b) * dimTrgWords + t; }; - for (size_t b = 0; b < dimBatch; b++) { - for (size_t t = 0; t < dimTrgWords; t++) { - for (size_t s = 0; s < dimSrcWords; s++) - ABORT_IF(locate(s, b, t) != batch->locateInGuidedAlignments(b, s, t), "locate() and locateInGuidedAlignments() differ??"); - // renormalize the alignment such that it sums up to 1 - float sum = 0; - for (size_t s = 0; s < dimSrcWords; s++) - sum += srcMask[srcBatch->locate(b, s)] * normalizedAlignment[locate(s, b, t)]; // these values are 0 or 1 - if (sum != 0 && sum != 1) - for (size_t s = 0; s < dimSrcWords; s++) - normalizedAlignment[locate(s, b, t)] /= sum; - } - } - auto alignment = constant_like(attention, std::move(normalizedAlignment)); - alignmentLoss = -sum(flatten(alignment * log(attention + epsilon))); - numLabels = batch->back()->batchWords(); - ABORT_IF(numLabels > shape.elements() / shape[-3], "Num labels of guided alignment cost is off??"); - } else { - auto alignment = constant_like(attention, batch->getGuidedAlignment()); - if(guidedLossType == "mse") - alignmentLoss = sum(flatten(square(attention - alignment))) / 2.f; - else if(guidedLossType == "mult") // @TODO: I don't know what this criterion is for. Can we remove it? - alignmentLoss = -log(sum(flatten(attention * alignment)) + epsilon); - else - ABORT("Unknown alignment cost type: {}", guidedLossType); - // every position is a label as they should all agree - // @TODO: there should be positional masking here ... on the other hand, positions that are not - // in a sentence should always agree (both being 0). Lack of masking affects label count only which is - // probably negligible? - numLabels = shape.elements(); - } + auto [indices, values] = guidedAlignmentToSparse(batch); + auto alignmentIndices = graph->indices(indices); + auto alignmentValues = graph->constant({(int)values.size()}, inits::fromVector(values)); + auto attentionAtAligned = cols(flatten(attention), alignmentIndices); + float epsilon = 1e-6f; + Expr alignmentLoss = -sum(alignmentValues * log(attentionAtAligned + epsilon)); + size_t numLabels = alignmentIndices->shape().elements(); + // Create label node, also weigh by scalar so labels and cost are in the same domain. // Fractional label counts are OK. But only if combined as "sum". 
// @TODO: It is ugly to check the multi-loss type here, but doing this right requires From b8bf086b109e92a4f304750a14cf99c1848b86a0 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 11 Feb 2022 06:04:38 -0800 Subject: [PATCH 158/254] move regression-tests pointer --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index d59f7ad85..da95717d4 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit d59f7ad85ecfdf4a788c095ac9fc1c447094e39e +Subproject commit da95717d416234859527af77960acca366d58d5c From 8a9580b3293d7ffa475d72851d9c962fb1aea77e Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 15 Feb 2022 11:18:29 +0000 Subject: [PATCH 159/254] update the intgemm version to upstream (#924) Some data types got upper cased, that's why there is a larger diff than expected Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 1 + VERSION | 2 +- src/3rd_party/intgemm | 2 +- src/tensors/cpu/expression_graph_packable.h | 12 +++++----- src/tensors/cpu/integer_common.h | 26 ++++++++++----------- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e2a40d57..6a1dabf92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce. - Changed minimal C++ standard to C++-17 - Faster LSH top-k search on CPU +- Updated intgemm to the latest upstream version ## [1.11.0] - 2022-02-08 diff --git a/VERSION b/VERSION index 3d461ead6..f5f1545d8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.3 +v1.11.4 diff --git a/src/3rd_party/intgemm b/src/3rd_party/intgemm index 8abde25b1..a05a2e51a 160000 --- a/src/3rd_party/intgemm +++ b/src/3rd_party/intgemm @@ -1 +1 @@ -Subproject commit 8abde25b13c3ab210c0dec8e23f4944e3953812d +Subproject commit a05a2e51ab524bcee954a39ee72005193f3adf7c diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index f5a9cad9c..1a233372c 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -172,19 +172,19 @@ class ExpressionGraphPackable : public ExpressionGraph { // Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type if(isSsse3(gemmElementType)) { - intgemm::ssse3::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + intgemm::SSSE3::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ paramMat->data(), /*output*/ quantMult, /*Quant Mult*/ rows(val), cols(val)); } else if(isAvx2(gemmElementType)) { - intgemm::avx2::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + intgemm::AVX2::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ paramMat->data(), /*output*/ quantMult, /*Quant Mult*/ rows(val), cols(val)); } else if(isAvx512(gemmElementType)) { - intgemm::avx512bw::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + intgemm::AVX512BW::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ paramMat->data(), /*output*/ quantMult, /*Quant Mult*/ rows(val), @@ -206,19 +206,19 @@ class ExpressionGraphPackable : public ExpressionGraph { // Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime cpu::integer::passOrAbort(gemmElementType); // 
Check if the hardware supports the GEMM type if(isSse2(gemmElementType)) { - intgemm::sse2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + intgemm::SSE2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ paramMat->data(), /*output*/ quantMult, /*Quant Mult*/ rows(val), cols(val)); } else if(isAvx2(gemmElementType)) { - intgemm::avx2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + intgemm::AVX2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ paramMat->data(), /*output*/ quantMult, /*Quant Mult*/ rows(val), cols(val)); } else if(isAvx512(gemmElementType)) { - intgemm::avx512bw::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + intgemm::AVX512BW::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ paramMat->data(), /*output*/ quantMult, /*Quant Mult*/ rows(val), diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h index cb372a745..f4e632b5c 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -11,21 +11,21 @@ namespace intgemm { struct Int8; struct Int16; - namespace ssse3 { + namespace SSSE3 { struct Kernels8; } - namespace sse2 { + namespace SSE2 { struct Kernels16; } - namespace avx2 { + namespace AVX2 { struct Kernels8; struct Kernels16; } - namespace avx512bw { + namespace AVX512BW { struct Kernels8; struct Kernels16; } - namespace avx512vnni { + namespace AVX512VNNI { struct Kernels8; } } @@ -57,22 +57,22 @@ template <> struct intgemm_ { }; template <> struct intgemm_ { - using width = intgemm::ssse3::Kernels8; + using width = intgemm::SSSE3::Kernels8; using type = int8_t; }; template <> struct intgemm_ { - using width = intgemm::avx2::Kernels8; + using width = intgemm::AVX2::Kernels8; using type = int8_t; }; template <> struct intgemm_ { - using width = intgemm::avx512bw::Kernels8; + using width = intgemm::AVX512BW::Kernels8; using type = int8_t; }; template <> struct intgemm_ { - using width = intgemm::avx512vnni::Kernels8; + using width = intgemm::AVX512VNNI::Kernels8; using type = int8_t; }; @@ -82,17 +82,17 @@ template <> struct intgemm_ { }; template <> struct intgemm_ { - using width = intgemm::sse2::Kernels16; + using width = intgemm::SSE2::Kernels16; using type = int16_t; }; template <> struct intgemm_ { - using width = intgemm::avx2::Kernels16; + using width = intgemm::AVX2::Kernels16; using type = int16_t; }; template <> struct intgemm_ { - using width = intgemm::avx512bw::Kernels16; + using width = intgemm::AVX512BW::Kernels16; using type = int16_t; }; @@ -220,4 +220,4 @@ void prepareAndTransposeB(io::Item& item, const char * input) { } //integer } //cpu -} //marian \ No newline at end of file +} //marian From 58c4576e5d4e49735ea65c44aa41d4f8e1713e97 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Feb 2022 11:21:14 +0000 Subject: [PATCH 160/254] Bump regression-tests from `da95717` to `88e6382` (#923) Bumps [regression-tests](https://github.com/marian-nmt/marian-regression-tests) from `da95717` to `88e6382`. - [Release notes](https://github.com/marian-nmt/marian-regression-tests/releases) - [Commits](https://github.com/marian-nmt/marian-regression-tests/compare/da95717d416234859527af77960acca366d58d5c...88e638224129e221366023107d4bd3a72ab65297) --- updated-dependencies: - dependency-name: regression-tests dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index da95717d4..88e638224 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit da95717d416234859527af77960acca366d58d5c +Subproject commit 88e638224129e221366023107d4bd3a72ab65297 From 601c9ac9807b5ffcbed298952435d9a17d954575 Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Tue, 15 Feb 2022 13:22:49 +0000 Subject: [PATCH 161/254] Detect fortran_order in npz (#911) * Fix fortran_order parsing * Abort on non row-major NPZ entries * Update CHANGELOG * Update VERSION Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 4 +++- VERSION | 2 +- src/3rd_party/cnpy/cnpy.cpp | 2 +- src/common/io.cpp | 3 +++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a1dabf92..721ffd06a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,12 +12,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Fixed - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load +- Fixed check for `fortran_ordering` in cnpy ### Changed -- Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce. +- Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce - Changed minimal C++ standard to C++-17 - Faster LSH top-k search on CPU - Updated intgemm to the latest upstream version +- Parameters in npz files are no longer implicitly assumed to be row-ordered. Non row-ordered parameters will result in an abort ## [1.11.0] - 2022-02-08 diff --git a/VERSION b/VERSION index f5f1545d8..62e1a5028 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.4 +v1.11.5 diff --git a/src/3rd_party/cnpy/cnpy.cpp b/src/3rd_party/cnpy/cnpy.cpp index 7ab102f91..1bda4f483 100644 --- a/src/3rd_party/cnpy/cnpy.cpp +++ b/src/3rd_party/cnpy/cnpy.cpp @@ -82,7 +82,7 @@ void cnpy::parse_npy_header(FILE* fp, char& type, unsigned int& word_size, unsig //fortran order loc1 = (int)header.find("fortran_order")+16; - fortran_order = (header.substr(loc1,5) == "True" ? true : false); + fortran_order = (header.substr(loc1,4) == "True" ? 
true : false); //shape loc1 = (int)header.find("("); diff --git a/src/common/io.cpp b/src/common/io.cpp index e0b3f39a5..6a7be6a36 100644 --- a/src/common/io.cpp +++ b/src/common/io.cpp @@ -90,6 +90,9 @@ void addMetaToItems(const std::string& meta, void loadItemsFromNpz(const std::string& fileName, std::vector& items) { auto numpy = cnpy::npz_load(fileName); for(auto it : numpy) { + ABORT_IF( + it.second->fortran_order, "Numpy item '{}' is not stored in row-major order", it.first); + Shape shape; shape.resize(it.second->shape.size()); for(size_t i = 0; i < it.second->shape.size(); ++i) From adaaf087e4c0804f47c85f0599301f5fd6dda2c2 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 16 Feb 2022 13:20:48 -0800 Subject: [PATCH 162/254] better error message --- src/graph/expression_graph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 7e2a57040..e3222a0fb 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -637,7 +637,7 @@ class ExpressionGraph : public std::enable_shared_from_this { ABORT_IF(paramsByElementType_.empty(), "No parameter object has been created"); // Safeguard against accessing parameters from the outside with multiple parameter types, not yet supported - ABORT_IF(paramsByElementType_.size() > 1, "Calling of params() is currently not supported with multiple ({}) parameters", paramsByElementType_.size()); + ABORT_IF(paramsByElementType_.size() > 1, "Calling of params() is currently not supported with multiple ({}) parameter types (did you try to access a quantized model?)", paramsByElementType_.size()); // Safeguard against accessing parameters from the outside with other than default parameter type, not yet supported auto it = paramsByElementType_.find(defaultElementType_); From 310d2f42f68e605666961abfa843c4f7a5697078 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 7 Mar 2022 16:57:32 +0000 Subject: [PATCH 163/254] Merged PR 22939: Fix case augmentation with multi-threaded reading This PR fixes case augmentation with multi-threaded reading. The solution is to not look at iterator::pos_ in lazy processing, rather pass it as an argument to the lazy function. --- CHANGELOG.md | 1 + VERSION | 2 +- src/data/corpus.cpp | 8 ++++---- src/data/corpus.h | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e2a40d57..5cb28e123 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added ### Fixed +- Fixed case augmentation with multi-threaded reading. 
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load ### Changed diff --git a/VERSION b/VERSION index 3d461ead6..f5f1545d8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.3 +v1.11.4 diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 2fbe49823..835d9d76c 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -39,10 +39,10 @@ Corpus::Corpus(std::vector paths, } -void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { +void Corpus::preprocessLine(std::string& line, size_t streamId, size_t lineId, bool& altered) { bool isFactoredVocab = vocabs_.back()->tryAs() != nullptr; altered = false; - if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) { + if (allCapsEvery_ != 0 && lineId % allCapsEvery_ == 0 && !inference_) { line = vocabs_[streamId]->toUpper(line); if (streamId == 0) LOG_ONCE(info, "[data] Source all-caps'ed line to: {}", line); @@ -50,7 +50,7 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { LOG_ONCE(info, "[data] Target all-caps'ed line to: {}", line); altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for all caps } - else if (titleCaseEvery_ != 0 && pos_ % titleCaseEvery_ == 1 && !inference_ && streamId == 0) { + else if (titleCaseEvery_ != 0 && lineId % titleCaseEvery_ == 1 && !inference_ && streamId == 0) { // Only applied to stream 0 (source) since this feature is aimed at robustness against // title case in the source (and not at translating into title case). // Note: It is user's responsibility to not enable this if the source language is not English. @@ -127,7 +127,7 @@ SentenceTuple Corpus::next() { } else { size_t vocabId = i - shift; bool altered; - preprocessLine(fields[i], vocabId, /*out=*/altered); + preprocessLine(fields[i], vocabId, curId, /*out=*/altered); if (altered) tup.markAltered(); addWordsToSentenceTuple(fields[i], vocabId, tup); diff --git a/src/data/corpus.h b/src/data/corpus.h index 281d43a22..20200e93f 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -33,7 +33,7 @@ class Corpus : public CorpusBase { // for pre-processing size_t allCapsEvery_{0}; // if set, convert every N-th input sentence (after randomization) to all-caps (source and target) size_t titleCaseEvery_{0}; // ditto for title case (source only) - void preprocessLine(std::string& line, size_t streamId, bool& altered); // altered => whether the segmentation was altered in marian + void preprocessLine(std::string& line, size_t streamId, size_t curId, bool& altered); // altered => whether the segmentation was altered in marian public: // @TODO: check if translate can be replaced by an option in options From 16bfa0c913959f44c65deb495ae8dbecd175d85f Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 16 Mar 2022 14:44:17 +0000 Subject: [PATCH 164/254] Merged PR 23094: Adapt --cost-scaling to more stable setting This PR sets default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling scale by 8 and do not try to automatically scale up or down. This seems most stable than variable cost-scaling with larger numbers that was the default before. --- CHANGELOG.md | 1 + VERSION | 2 +- src/common/aliases.cpp | 2 +- src/common/config_parser.cpp | 4 ++-- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cb28e123..1d2b4338d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load ### Changed +- Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling scale by 8 and do not try to automatically scale up or down. This seems most stable. - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce. - Changed minimal C++ standard to C++-17 - Faster LSH top-k search on CPU diff --git a/VERSION b/VERSION index f5f1545d8..62e1a5028 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.4 +v1.11.5 diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index b38ccc648..3db31e515 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -32,7 +32,7 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) { if(mode_ == cli::mode::training) { config["precision"] = std::vector({"float16", "float32"}); // inference type, optimization type, save type // scaling factor, frequency, multiplier at increase, minium scaling factor - config["cost-scaling"] = std::vector({"256.f", "1000", "2.f", "256.f"}); + config["cost-scaling"] = std::vector({"8.f", "10000", "1.f", "8.f"}); } else { config["precision"] = std::vector({"float16"}); // for inference we do not need the other types } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 0d9564953..e3ac21088 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -534,7 +534,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { // mixed precision training cli.add("--fp16", "Shortcut for mixed precision training with float16 and cost-scaling, " - "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f"); + "corresponds to: --precision float16 float32 --cost-scaling 8.f 10000 1.f 8.f"); cli.add>("--precision", "Mixed precision training for forward/backward pass and optimizaton. " "Defines types for: forward/backward pass, optimization.", @@ -542,7 +542,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add>("--cost-scaling", "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") - ->implicit_val("256.f 1000 2.f 256.f"); + ->implicit_val("8.f 10000 1.f 8.f"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", From c809843f14a5030d2ecbe9bd2be48e0bf9431134 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Mar 2022 08:38:30 +0000 Subject: [PATCH 165/254] Bump examples from `6d5921c` to `29f4f7c` (#928) Bumps [examples](https://github.com/marian-nmt/marian-examples) from `6d5921c` to `29f4f7c`. - [Release notes](https://github.com/marian-nmt/marian-examples/releases) - [Commits](https://github.com/marian-nmt/marian-examples/compare/6d5921cc7de91f4e915b59e9c52c9a76c4e99b00...29f4f7c380c860a95b9375813f4b199b2e6b5556) --- updated-dependencies: - dependency-name: examples dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples b/examples index 6d5921cc7..29f4f7c38 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 6d5921cc7de91f4e915b59e9c52c9a76c4e99b00 +Subproject commit 29f4f7c380c860a95b9375813f4b199b2e6b5556 From 75a7a1dfd2b2aad7eb35369e6f56f11351f1d9f2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Mar 2022 08:40:11 +0000 Subject: [PATCH 166/254] Bump regression-tests from `88e6382` to `4fa9ff5` (#929) Bumps [regression-tests](https://github.com/marian-nmt/marian-regression-tests) from `88e6382` to `4fa9ff5`. - [Release notes](https://github.com/marian-nmt/marian-regression-tests/releases) - [Commits](https://github.com/marian-nmt/marian-regression-tests/compare/88e638224129e221366023107d4bd3a72ab65297...4fa9ff55af68bc87d8bd04c9b410f1e1d3874718) --- updated-dependencies: - dependency-name: regression-tests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 88e638224..4fa9ff55a 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 88e638224129e221366023107d4bd3a72ab65297 +Subproject commit 4fa9ff55af68bc87d8bd04c9b410f1e1d3874718 From 78bef7aeba8c281205e334045897ebe5f0136784 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 22 Mar 2022 10:06:11 +0000 Subject: [PATCH 167/254] Bump src/3rd_party/sentencepiece from `c307b87` to `5312a30` (#927) Bumps [src/3rd_party/sentencepiece](https://github.com/marian-nmt/sentencepiece) from `c307b87` to `5312a30`. - [Release notes](https://github.com/marian-nmt/sentencepiece/releases) - [Commits](https://github.com/marian-nmt/sentencepiece/compare/c307b874deb5ea896db8f93506e173353e66d4d3...5312a306c4c0a458e29a8882ebfb42a179aaf580) --- updated-dependencies: - dependency-name: src/3rd_party/sentencepiece dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/3rd_party/sentencepiece | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index c307b874d..5312a306c 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit c307b874deb5ea896db8f93506e173353e66d4d3 +Subproject commit 5312a306c4c0a458e29a8882ebfb42a179aaf580 From 23c36ec1a3c71cc75bc49fd3e39a4b1d8636589d Mon Sep 17 00:00:00 2001 From: Artur Nowakowski Date: Tue, 22 Mar 2022 11:07:41 +0100 Subject: [PATCH 168/254] Fixed fp16 training/inference with factors-combine concat (#926) --- CHANGELOG.md | 1 + src/layers/embedding.cpp | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 721ffd06a..fe2f91ee1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
### Fixed - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load - Fixed check for `fortran_ordering` in cnpy +- Fixed fp16 training/inference with factors-combine concat method ### Changed - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 26d6b7fe3..d6768fdbf 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -57,8 +57,7 @@ Embedding::Embedding(Ptr graph, Ptr options) auto lemmaEmbs = rows(E_, lemmaIndices); int dimFactors = FactorEmbMatrix_->shape()[0]; auto factEmbs - = dot(graph->constant( - {(int)data.size(), dimFactors}, inits::fromVector(factorIndices), Type::float32), + = dot(graph->constant({(int)data.size(), dimFactors}, inits::fromVector(factorIndices)), FactorEmbMatrix_); return concatenate({lemmaEmbs, factEmbs}, -1); From d5c7372a67a6baf1df58ae5ef7240372cedf73c1 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 8 Apr 2022 16:00:04 +0000 Subject: [PATCH 169/254] Merged PR 23407: Fix incorrect/missing gradient accumulation for affine biases This PR fixes incorrect/missing gradient accumulation with delay > 1 or large effective batch size of biases of affine operations. --- CHANGELOG.md | 1 + VERSION | 2 +- src/graph/node_operators_binary.h | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d2b4338d..681fa59a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added ### Fixed +- Fix incorrect/missing gradient accumulation with delay > 1 or large effective batch size of biases of affine operations. - Fixed case augmentation with multi-threaded reading. 
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load diff --git a/VERSION b/VERSION index 62e1a5028..a130ad69a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.5 +v1.11.6 diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index b2a646b1c..f46e0b899 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -334,7 +334,7 @@ class AffineNodeOp : public NaryNodeOp { false, 1.0, scalar_, computeTypeB)), - NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC)) }; if(transA_ && !transB_) @@ -353,7 +353,7 @@ class AffineNodeOp : public NaryNodeOp { false, 1.0, scalar_, computeTypeB)), - NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC)) }; if(transA_ && transB_) @@ -372,7 +372,7 @@ class AffineNodeOp : public NaryNodeOp { true, 1.0, scalar_, computeTypeB)), - NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC)) }; return { @@ -390,7 +390,7 @@ class AffineNodeOp : public NaryNodeOp { false, 1.0, scalar_, computeTypeB)), - NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC)) + NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC)) }; } From 1e4e1014eded1d9d82e3f4becba177ac0cd390f5 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 8 Apr 2022 17:15:56 +0000 Subject: [PATCH 170/254] Merged PR 23415: Set Windows image back to windows-2019 This should resolve latest issues with Windows checks. --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f5e92400a..192f0c871 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -34,7 +34,7 @@ variables: - name: MKL_URL value: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" - name: VCPKG_COMMIT - value: c69096659f49e2b1aca532ea5c2f8c135182519b + value: 2022.03.10 - name: VCPKG_DIR value: "$(Build.SourcesDirectory)/vcpkg" - name: VCPKG_PACKAGES @@ -67,7 +67,7 @@ stages: cuda_version: 10.2 pool: - vmImage: windows-latest + vmImage: windows-2019 steps: - checkout: self @@ -438,7 +438,7 @@ stages: displayName: Windows CPU+FBGEMM pool: - vmImage: windows-latest + vmImage: windows-2019 steps: # Due to multiple checkouts this will be commonly cloned into D:\a\1\s\marian-dev From 1a7435827716ff125ce8dcee462bb392110f8a0b Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 11 Apr 2022 20:19:58 +0000 Subject: [PATCH 171/254] Merged PR 23429: Small fixes around fp16 training and batch fitting This PR introduces small fixes around fp16 training and batch fitting: * Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast) * Throw `ShapeSizeException` if total expanded shape size exceeds numeric capacity of the maximum int value (2^31-1) * During mini-batch-fitting, catch `ShapeSizeException` and use another sizing hint. Aborts outside mini-batch-fitting. * Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes. 
--- CHANGELOG.md | 4 ++++ VERSION | 2 +- src/common/config_parser.cpp | 4 ++-- src/common/shape.h | 28 +++++++++++++++++++++++++--- src/embedder/embedder.h | 2 +- src/graph/expression_graph.cpp | 17 +++++++++++++++++ src/graph/expression_graph.h | 9 ++++----- src/layers/guided_alignment.h | 4 ++-- src/rescorer/rescorer.h | 2 +- src/tensors/backend.h | 1 + src/tensors/cpu/backend.h | 4 ++++ src/tensors/gpu/backend.h | 6 ++++++ src/training/graph_group.cpp | 27 ++++++++++++++++++++++----- src/translator/translator.h | 4 ++-- 14 files changed, 92 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 681fa59a6..db2f658a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,11 +11,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added ### Fixed +- Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast) +- Throw `ShapeSizeException` if total expanded shape size exceeds numeric capacity of the maximum int value (2^31-1) +- During mini-batch-fitting, catch `ShapeSizeException` and use another sizing hint. Aborts outside mini-batch-fitting. - Fix incorrect/missing gradient accumulation with delay > 1 or large effective batch size of biases of affine operations. - Fixed case augmentation with multi-threaded reading. - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load ### Changed +- Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes. - Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling scale by 8 and do not try to automatically scale up or down. This seems most stable. - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce. - Changed minimal C++ standard to C++-17 diff --git a/VERSION b/VERSION index a130ad69a..77418c859 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.6 +v1.11.7 diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index e3ac21088..404b43f1e 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -118,8 +118,8 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { ->implicit_val("basic"); cli.add>("--config,-c", "Configuration file(s). If multiple, later overrides earlier"); - cli.add("--workspace,-w", - "Preallocate arg MB of work space", + cli.add("--workspace,-w", + "Preallocate arg MB of work space. 
Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes.", defaultWorkspace); cli.add("--log", "Log training process information to file given by arg"); diff --git a/src/common/shape.h b/src/common/shape.h index 59e6cf21f..270b35376 100644 --- a/src/common/shape.h +++ b/src/common/shape.h @@ -12,6 +12,26 @@ namespace marian { +class ShapeSizeException : public std::exception { +private: + char* message_; + +public: + ShapeSizeException(size_t available, size_t asked) { + std::string mstr = "Expanded shape size " + std::to_string(asked) + + " exceeds numeric capcacity " + std::to_string(available); + + message_ = new char[mstr.size() + 1]; + std::copy(mstr.begin(), mstr.end(), message_); + message_[mstr.size()] = 0; + } + + ~ShapeSizeException() { delete[] message_; } + + virtual const char* what() const noexcept override { return message_; } +}; + + struct Slice // Python-like slice/index descriptor { Slice(int b, int e, int s) : begin(b), end(e), stride(s) {} @@ -110,10 +130,12 @@ struct Shape { template // using a template so that FactoredSegmenter, which uses this as well, can pass size_t inline T elements() const { - T el = 1; + size_t el = 1; for(auto s : shape_) - el *= (T)s; - return el; + el *= (size_t)s; + if(el > std::numeric_limits::max()) + throw ShapeSizeException(std::numeric_limits::max(), el); + return (T)el; } inline void dims(int i, std::vector& d) const { diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index f2e4a10c0..36b3df444 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -84,7 +84,7 @@ class Embed : public ModelTask { auto precison = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph graph->setDevice(device); - graph->reserveWorkspaceMB(options_->get("workspace")); + graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); } diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 12a1195e1..146f7c4ca 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -23,6 +23,23 @@ void ExpressionGraph::setDevice(DeviceId deviceId, Ptr device) { } } +void ExpressionGraph::reserveWorkspaceMB(int num) { + size_t bytes; + if(num > 0) { + bytes = (size_t)num * 1024 * 1024 - 1; + } else if (num < 0) { + ABORT_IF(getDeviceId().type == DeviceType::cpu, "Negative workspace not allowed on CPU device"); + size_t globalMemorySize = backend_->getGlobalMemorySize(); // in bytes, only implemented for GPU backend + size_t notWorkspaceSize = (size_t)std::abs(num) * 1024 * 1024 - 1; + ABORT_IF(notWorkspaceSize >= globalMemorySize, "Negative workspace {} larger/equal total memory {}?", notWorkspaceSize, globalMemorySize); + bytes = globalMemorySize - notWorkspaceSize; + LOG(debug, "Reserving {} = {} - {} bytes as workspace", bytes, globalMemorySize, notWorkspaceSize); + } else { + ABORT("Allocating 0 bytes?"); + } + tensors_->reserve(bytes); +} + Expr ExpressionGraph::add(Expr node) { auto found = tensors_->findOrRemember(node); if(found) { diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index e3222a0fb..9272e42a3 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -244,11 +244,10 @@ class ExpressionGraph : public std::enable_shared_from_this { * Preallocate workspace memory (MB) for the graph. 
* Sets the size of the memory available for the forward and backward step of the training procedure. * This does not include model size and optimizer parameters that are allocated outsize workspace. + * If memory is negative (<0) the total available GPU memory is used with the absolute value substracted. + * Negative workspace is not supported on CPU. */ - void reserveWorkspaceMB(size_t num) { - size_t bytes = num * 1024 * 1024 - 1; - tensors_->reserve(bytes); - } + void reserveWorkspaceMB(int num); /** Copy tensor objects from one graph to current graph */ void reuseWorkspace(Ptr graph) { @@ -277,7 +276,7 @@ class ExpressionGraph : public std::enable_shared_from_this { tensors_->throwAtReallocation(true); backprop(); tensors_->throwAtReallocation(false); - } catch(AllocationException&) { + } catch(const AllocationException&) { tensors_->throwAtReallocation(false); return false; } diff --git a/src/layers/guided_alignment.h b/src/layers/guided_alignment.h index d2171c508..d5929a6d6 100644 --- a/src/layers/guided_alignment.h +++ b/src/layers/guided_alignment.h @@ -53,9 +53,9 @@ static inline RationalLoss guidedAlignmentCost(Ptr graph, auto attentionAtAligned = cols(flatten(attention), alignmentIndices); float epsilon = 1e-6f; - Expr alignmentLoss = -sum(alignmentValues * log(attentionAtAligned + epsilon)); + Expr alignmentLoss = -sum(cast(alignmentValues * log(attentionAtAligned + epsilon), Type::float32)); size_t numLabels = alignmentIndices->shape().elements(); - + // Create label node, also weigh by scalar so labels and cost are in the same domain. // Fractional label counts are OK. But only if combined as "sum". // @TODO: It is ugly to check the multi-loss type here, but doing this right requires diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index af0a16066..26d74917e 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -73,7 +73,7 @@ class Rescore : public ModelTask { auto precison = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph graph->setDevice(device); - graph->reserveWorkspaceMB(options_->get("workspace")); + graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); } diff --git a/src/tensors/backend.h b/src/tensors/backend.h index e0e93039e..64a28f925 100644 --- a/src/tensors/backend.h +++ b/src/tensors/backend.h @@ -29,6 +29,7 @@ class Backend { // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name. virtual void setDevice() = 0; virtual void synchronize() = 0; + virtual size_t getGlobalMemorySize() = 0; // for CPU, sets to use optimized code for inference. // for GPU, this is invalid. for gpu, isOptimized() function always returns false. diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index f52ff6a33..76c47a79d 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -20,6 +20,10 @@ class Backend : public marian::Backend { void setDevice() override {} void synchronize() override {} + size_t getGlobalMemorySize() override { + ABORT("Not implemented on CPU"); + } + // for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU. 
void setOptimized(bool optimize) override { optimized_ = optimize; } bool isOptimized() override { return optimized_; } diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h index 410b41a49..022e4f3fe 100644 --- a/src/tensors/gpu/backend.h +++ b/src/tensors/gpu/backend.h @@ -96,6 +96,12 @@ class Backend : public marian::Backend { CudaCompute getCudaComputeCapability() { return compute_; } + size_t getGlobalMemorySize() override { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, (int)deviceId_.no)); + return prop.totalGlobalMem; + } + private: cublasHandle_t cublasHandle_{0}; // make sure it's 0, so it can be initalized lazily cusparseHandle_t cusparseHandle_{0}; // as above diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 59cd4b6d8..4d92b1c9c 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -82,7 +82,7 @@ void GraphGroup::initGraphsAndOpts() { graph->setDevice(device); - graph->reserveWorkspaceMB(options_->get("workspace")); + graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); @@ -510,8 +510,18 @@ Ptr GraphGroup::collectStats(Ptr graph, lengths[j] = std::min(lengths[j], localMaxes[j]); auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, maxBatch, options_); - auto loss = model->build(graph, batch); - fits = graph->fits(); + + // We check for a ShapeSizeException (happens if total shape size would exceed max int). + // If caught, we reduce the batch size. In any other context, this exception will cause + // an error and exit Marian. + try { + auto loss = model->build(graph, batch); + fits = graph->fits(); + } catch(const ShapeSizeException& e) { + LOG(debug, "Exception for maxBatch size {}: {}", maxBatch, e.what()); + fits = false; + } + if(fits) maxBatch *= 2; } @@ -530,8 +540,15 @@ Ptr GraphGroup::collectStats(Ptr graph, do { size_t current = (start + end) / 2; auto batch = data::CorpusBatch::fakeBatch(lengths, vocabs, current, options_); - auto loss = model->build(graph, batch); - fits = graph->fits(); + + // Same as above. 
+ try { + auto loss = model->build(graph, batch); + fits = graph->fits(); + } catch(const ShapeSizeException& e) { + LOG(debug, "Exception for maxBatch size {}: {}", maxBatch, e.what()); + fits = false; + } LOG(debug, "[batching] length: {} - size: {} - fits: {}", lengths[0], current, fits); diff --git a/src/translator/translator.h b/src/translator/translator.h index 75b5070b3..3103e7ddc 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -98,7 +98,7 @@ class Translate : public ModelTask { graph->getBackend()->setGemmType(options_->get("gemm-type")); graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); } - graph->reserveWorkspaceMB(options_->get("workspace")); + graph->reserveWorkspaceMB(options_->get("workspace")); graphs_[id] = graph; std::vector> scorers; @@ -311,7 +311,7 @@ class TranslateService : public ModelServiceTask { graph->getBackend()->setGemmType(options_->get("gemm-type")); graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); } - graph->reserveWorkspaceMB(options_->get("workspace")); + graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); auto scorers = createScorers(options_, model_items_); From e4f3d0f740829bd3bb9c745a8faee1d102487775 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 9 May 2022 13:28:28 -0700 Subject: [PATCH 172/254] add fallback option for sampling, for back-compat --- src/models/model_factory.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 52a87e72a..394344f72 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -374,7 +374,10 @@ Ptr createModelFromOptions(Ptr options, usage use) { auto sampling = options->get>("output-sampling", {}); std::string method = sampling.size() > 0 ? sampling[0] : "full"; - if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) { + if(method == "0") { /*for backwards-compat when output-sampling: false in yaml file*/ + // do normal decoding + return New(std::dynamic_pointer_cast(baseModel), New()); + } else if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) { LOG(info, "Output sampling from the full softmax distribution"); return New(std::dynamic_pointer_cast(baseModel), New()); } else if(method == "topk") { From e0e3287a3bc64d8023f2fbc972b66a92413cd4b9 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 12 May 2022 16:23:58 +0000 Subject: [PATCH 173/254] Merged PR 23840: Update CUDA installation script for Ubuntu Updates CUDA deb/key fetching https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ --- scripts/ci/install_cuda_ubuntu.sh | 87 ++++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 13 deletions(-) diff --git a/scripts/ci/install_cuda_ubuntu.sh b/scripts/ci/install_cuda_ubuntu.sh index de60a5b65..41fcee463 100755 --- a/scripts/ci/install_cuda_ubuntu.sh +++ b/scripts/ci/install_cuda_ubuntu.sh @@ -17,6 +17,33 @@ if [[ $# -lt 1 ]]; then exit 2 fi + +## ------------------------------------------------------------------ +## Bash functions +## ------------------------------------------------------------------ + +# returns 0 (true) if a >= b +function version_ge() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." 
&& exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$2" ] +} +# returns 0 (true) if a > b +function version_gt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_ge $1 $2 +} +# returns 0 (true) if a <= b +function version_le() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$(printf '%s\n' "$@" | sort -V | head -n 1)" == "$1" ] +} +# returns 0 (true) if a < b +function version_lt() { + [ "$#" != "2" ] && echo "${FUNCNAME[0]} requires exactly 2 arguments." && exit 1 + [ "$1" = "$2" ] && return 1 || version_le $1 $2 +} + + ## ------------------------------------------------------------------ ## Find CUDA and OS versions ## ------------------------------------------------------------------ @@ -53,22 +80,52 @@ fi ## Select CUDA packages to install ## ------------------------------------------------------------------ +# Ideally choose from the list of meta-packages to minimise variance between cuda versions (although it does change too). Some of these packages may not be availble pre cuda 10. CUDA_PACKAGES_IN=( - "command-line-tools" - "libraries-dev" + "cuda-compiler" + "cuda-cudart-dev" + "cuda-nvtx" + "cuda-nvrtc-dev" + "libcublas-dev" + "libcurand-dev" # 11-0+ + "libcusparse-dev" # 11-0+ + "cuda-cccl" # 11.4+, provides cub and thrust. On 11.3 knwon as cuda-thrust-11-3 ) CUDA_PACKAGES="" for package in "${CUDA_PACKAGES_IN[@]}"; do # @todo This is not perfect. Should probably provide a separate list for diff versions # cuda-compiler-X-Y if CUDA >= 9.1 else cuda-nvcc-X-Y - if [[ "${package}" == "nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then - package="compiler" - elif [[ "${package}" == "compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then - package="nvcc" + if [[ "${package}" == "cuda-nvcc" ]] && version_ge "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + package="cuda-compiler" + elif [[ "${package}" == "cuda-compiler" ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "9.1" ; then + package="cuda-nvcc" + # CUB/Thrust are packages in cuda-thrust in 11.3, but cuda-cccl in 11.4+ + elif [[ "${package}" == "cuda-thrust" || "${package}" == "cuda-cccl" ]]; then + # CUDA cuda-thrust >= 11.4 + if version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.4" ; then + package="cuda-cccl" + # Use cuda-thrust > 11.2 + elif version_ge "$CUDA_VERSION_MAJOR_MINOR" "11.3" ; then + package="cuda-thrust" + # Do not include this pacakge < 11.3 + else + continue + fi + fi + # CUDA 11+ includes lib* / lib*-dev packages, which if they existed previously where cuda-cu*- / cuda-cu*-dev- + if [[ ${package} == libcu* ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "11.0" ; then + if [[ ${package} != libcublas* ]]; then + package="${package/libcu/cuda-cu}" + fi + fi + + if [[ ${package} == libcublas* ]] && version_lt "$CUDA_VERSION_MAJOR_MINOR" "11.0" ; then + CUDA_PACKAGES+=" ${package}" + else + # Build the full package name and append to the string. + CUDA_PACKAGES+=" ${package}-${CUDA_MAJOR}-${CUDA_MINOR}" fi - # Build the full package name and append to the string. 
-    CUDA_PACKAGES+=" cuda-${package}-${CUDA_MAJOR}-${CUDA_MINOR}"
 done
 echo "CUDA_PACKAGES ${CUDA_PACKAGES}"
@@ -77,14 +134,18 @@ echo "CUDA_PACKAGES ${CUDA_PACKAGES}"
 ## Prepare to install
 ## ------------------------------------------------------------------
+CPU_ARCH="x86_64"
 PIN_FILENAME="cuda-ubuntu${UBUNTU_VERSION}.pin"
-PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/${PIN_FILENAME}"
-APT_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/7fa2af80.pub"
-REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/"
+PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/${PIN_FILENAME}"
+# apt keyring package now available https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/
+KERYRING_PACKAGE_FILENAME="cuda-keyring_1.0-1_all.deb"
+KEYRING_PACKAGE_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/${KERYRING_PACKAGE_FILENAME}"
+REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/${CPU_ARCH}/"
 echo "PIN_FILENAME ${PIN_FILENAME}"
 echo "PIN_URL ${PIN_URL}"
-echo "APT_KEY_URL ${APT_KEY_URL}"
+echo "KEYRING_PACKAGE_URL ${KEYRING_PACKAGE_URL}"
+echo "REPO_URL ${REPO_URL}"
 ## ------------------------------------------------------------------
 ## Install CUDA
@@ -93,7 +154,7 @@
 echo "Adding CUDA Repository"
 wget ${PIN_URL}
 sudo mv ${PIN_FILENAME} /etc/apt/preferences.d/cuda-repository-pin-600
-sudo apt-key adv --fetch-keys ${APT_KEY_URL}
+wget ${KEYRING_PACKAGE_URL} && sudo dpkg -i ${KERYRING_PACKAGE_FILENAME} && rm ${KERYRING_PACKAGE_FILENAME}
 sudo add-apt-repository "deb ${REPO_URL} /"
 sudo apt-get update

From 704a323142c44e80ff21607e5603324d7b9a3f37 Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz
Date: Fri, 13 May 2022 07:30:36 +0000
Subject: [PATCH 174/254] Merged PR 22799: Running regression tests on Azure Pipelines

This PR adds an Azure Pipeline for running regression tests on an Azure Hosted
GPU Pool. It currently runs on Ubuntu 18.04, GCC 8, CUDA 11.1, and a single
Nvidia M60 GPU device (Maxwell).

The pipeline needs to be started manually: go to "Pipelines", then "Marian GPU
Pool", click "Run pipeline", select the branch, click "Run".
---
 azure-regression-tests.yml | 126 +++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 azure-regression-tests.yml

diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml
new file mode 100644
index 000000000..d6053e53c
--- /dev/null
+++ b/azure-regression-tests.yml
@@ -0,0 +1,126 @@
+# Azure pipelines for Marian NMT
+#
+# The pipeline needs to be added manually to the repository, for example:
+# 1. Go to Your repository > Pipelines, click "New pipeline"
+# 2. Choose "Azure Repos Git" and a repository
+# 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file
+# 4. "More actions" > "Save"
+
+# The pipeline has no CI trigger and needs to be started manually, for example:
+# 1. Go to the Pipeline created above
+# 2.
Click "Run pipeline" and select a "Branch/tag" you want to run it with +trigger: none + +# Hosted Azure DevOps Pool determining OS, CUDA version and available GPUs +pool: mariandevops-pool-m60-eus + +stages: +- stage: TestsGPU + jobs: + + ###################################################################### + - job: TestsGPULinux + displayName: Linux GPU tests + timeoutInMinutes: 120 + + steps: + - checkout: self + submodules: true + + # librt.* from the default anaconda environment are deleted because they crash the linker at the + # end of compilation. This is an issue with the pre-defined VM image that is used for the Pool + # and will not persist for other images + # TODO: There should be no need to install python3 + - bash: | + rm -f /anaconda/envs/py38_default/x86_64-conda-linux-gnu/sysroot/usr/lib/librt.* + sudo apt-get install -y gcc-8 g++-8 p7zip-full python3-pip + displayName: Clean and install packages + + # Collect details about CPU and GPU. + # Because the outputs goes into regression-tests/*.log files, they will be included in the artifact. + - bash: | + echo ">>> lscpu" + lscpu | tee lscpu.log + echo ">>> cpuinfo" + cat /proc/cpuinfo | tee cpuinfo.log + /usr/bin/gcc-8 --version | tee gcc.log + echo ">>> nvidia-smi" + nvidia-smi | tee nvidia-smi.log + echo ">>> python" + which python3 | tee python.log + python3 --version | tee -a python.log + python3 -m pip --version | tee -a python.log + echo ">>> df" + df -h | tee df.log + displayName: Collect system info + workingDirectory: regression-tests + + # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html + - bash: | + wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - + sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" + sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" + sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 + displayName: Install MKL + + - bash: | + mkdir -p build + cd build + CC=/usr/bin/gcc-8 CXX=/usr/bin/g++-8 CUDAHOSTCXX=/usr/bin/g++-8 \ + cmake .. \ + -DCOMPILE_CPU=on \ + -DUSE_FBGEMM=on \ + -DCOMPILE_CUDA=on \ + -DDETERMINISTIC=on \ + -DUSE_STATIC_LIBS=on \ + -DCOMPILE_EXAMPLES=on \ + -DCOMPILE_SERVER=on \ + -DCOMPILE_TESTS=on \ + -DCOMPILE_MAXWELL=on -DCOMPILE_PASCAL=off -DCOMPILE_VOLTA=off -DCOMPILE_TURING=off -DCOMPILE_AMPERE=off -DCOMPILE_AMPERE_RTX=off \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.1 + displayName: Configure CMake + + - bash: make -j5 + displayName: Compile + workingDirectory: build + + - bash: | + ./marian --version + ./marian-decoder --version + ./marian-scorer --version + ./spm_encode --version + displayName: Print versions + workingDirectory: build + + # Run unit tests with verbose output + - bash: ctest --verbose --output-on-failure + displayName: Run unit tests + workingDirectory: build + + # Always run regression tests from the master branch + - bash: | + git checkout master + git pull origin master + make install + displayName: Prepare regression tests + workingDirectory: regression-tests + + # Continue on error to be able to collect outputs and publish them as an artifact + - bash: MARIAN=../build ./run_mrt.sh + continueOnError: true + displayName: Run regression tests + workingDirectory: regression-tests + + - bash: | + nvidia-smi + # cut -c3- removes './' from paths making 7z to retain the directory structure + find . 
-type f \( -name "*.log" -o -name "*.out" -o -name "*.diff" \) -print | cut -c3- > listing.txt + echo "Creating an artifact with the following files:" + cat listing.txt + 7z a -tzip ../regression-tests-ci-public_linux-x64-static_cuda_m60.zip @listing.txt + displayName: Collect outputs + workingDirectory: regression-tests + + - publish: regression-tests-ci-public_linux-x64-static_cuda_m60.zip + artifact: regression-tests-ci-public_linux-x64-static_cuda_m60 + displayName: Publish outputs From 95720ae19fa21b1726787fb2db57535cafba84fa Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Wed, 18 May 2022 11:11:28 +0100 Subject: [PATCH 175/254] Update NVIDIA CUDA signing key for CI; fix for building docs (#932) * Update NVIDIA CUDA signing key for CI * Constrain Jinja2 to build docs --- doc/requirements.txt | 1 + scripts/ci/install_cuda_ubuntu.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/requirements.txt b/doc/requirements.txt index a2416e9a1..40de5ddd9 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -6,3 +6,4 @@ myst-parser==0.14.0a3 mistune<2.0.0 m2r sphinx-mathjax-offline +Jinja2<3.1 diff --git a/scripts/ci/install_cuda_ubuntu.sh b/scripts/ci/install_cuda_ubuntu.sh index de60a5b65..d8f7da414 100755 --- a/scripts/ci/install_cuda_ubuntu.sh +++ b/scripts/ci/install_cuda_ubuntu.sh @@ -79,7 +79,7 @@ echo "CUDA_PACKAGES ${CUDA_PACKAGES}" PIN_FILENAME="cuda-ubuntu${UBUNTU_VERSION}.pin" PIN_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/${PIN_FILENAME}" -APT_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/7fa2af80.pub" +APT_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/3bf863cc.pub" REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/" echo "PIN_FILENAME ${PIN_FILENAME}" From 042ed8f2e23557d0cdb956aea7d79be8c817e0b0 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 30 May 2022 07:27:15 +0000 Subject: [PATCH 176/254] Merged PR 24072: Revert changes to transformer caching This PR reverts changes to transformer caching (public PR https://github.com/marian-nmt/marian-dev/pull/881) It seems to cause catastrophic memory leaks or incorrect de-allocation during decoding. --- src/CMakeLists.txt | 1 - src/common/hash.cpp | 12 ------- src/common/hash.h | 24 +++----------- src/models/transformer.h | 69 ++++++++++++++++++++-------------------- 4 files changed, 39 insertions(+), 67 deletions(-) delete mode 100644 src/common/hash.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3718807a5..e4599c407 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,7 +30,6 @@ set(MARIAN_SOURCES common/filesystem.cpp common/file_stream.cpp common/file_utils.cpp - common/hash.cpp common/signal_handling.cpp common/types.cpp diff --git a/src/common/hash.cpp b/src/common/hash.cpp deleted file mode 100644 index 57e5e9145..000000000 --- a/src/common/hash.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include - -#include "hash.h" -#include "common/shape.h" - -namespace std { -size_t hash>::operator()(pair const& k) const { - size_t seed = hash{}(k.first); - marian::util::hash_combine(seed, k.second.hash()); - return seed; -} -} // namespace std diff --git a/src/common/hash.h b/src/common/hash.h index 37dab5e76..7aca30de2 100644 --- a/src/common/hash.h +++ b/src/common/hash.h @@ -7,18 +7,16 @@ namespace util { template using hash = std::hash; -/** - * Combine hash values. 
- * This combinator is based on boost::hash_combine, but uses std::hash as the hash implementation. - * Used as a drop-in replacement for boost::hash_combine. - */ +// This combinator is based on boost::hash_combine, but uses +// std::hash as the hash implementation. Used as a drop-in +// replacement for boost::hash_combine. template inline void hash_combine(HashType& seed, T const& v) { hash hasher; seed ^= static_cast(hasher(v)) + 0x9e3779b9 + (seed<<6) + (seed>>2); } -/** Hash a whole chunk of memory. */ +// Hash a whole chunk of memory, mostly used for diagnostics template inline HashType hashMem(const T* beg, size_t len) { HashType seed = 0; @@ -27,17 +25,5 @@ inline HashType hashMem(const T* beg, size_t len) { return seed; } -} // namespace util - -struct Shape; // Forward declaration -} // namespace marian - -namespace std { -/** - * std::hash specialization for the string-shape pair used as a cache key in transformer.h. - */ -template <> -struct hash> { - size_t operator()(pair const& k) const; -}; +} } diff --git a/src/models/transformer.h b/src/models/transformer.h index 95a55d3aa..d87594e0e 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -5,7 +5,6 @@ #include "marian.h" -#include "common/hash.h" #include "layers/constructors.h" #include "models/decoder.h" #include "models/encoder.h" @@ -29,7 +28,7 @@ class Transformer : public EncoderOrDecoderBase { protected: using Base::options_; using Base::inference_; using Base::batchIndex_; using Base::graph_; - std::unordered_map, Expr> cache_; // caching transformation of the encoder that should not be created again + std::unordered_map cache_; // caching transformation of the encoder that should not be created again mutable/*lazy*/ std::vector sinusoidalEmbeddingsFreq_, sinusoidalEmbeddingsOffs_; // cached contributions to sinusoidal embeddings bool depthScaling_{false}; // As recommended in the GPT-2 paper, down-scale layer weights by a factor of 1 / sqrt(depth); @@ -41,16 +40,16 @@ class Transformer : public EncoderOrDecoderBase { std::vector alignments_; // [max tgt len or 1][beam depth, max src length, batch size, 1] // @TODO: make this go away - template - T opt(const char* const key) const { Ptr options = options_; return options->get(key); } + template + T opt(const char* const key) const { Ptr options = options_; return options->get(key); } - template - T opt(const std::string& key) const { return opt(key.c_str()); } + template + T opt(const std::string& key) const { return opt(key.c_str()); } - template + template T opt(const char* const key, const T& def) const { Ptr options = options_; return options->get(key, def); } - template + template T opt(const std::string& key, const T& def) const { opt(key.c_str(), def); } public: @@ -256,7 +255,7 @@ class Transformer : public EncoderOrDecoderBase { // take softmax along src sequence axis (-1) auto weights = softmax(z); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] - + if(saveAttentionWeights) collectOneHead(weights, dimBeam); @@ -289,26 +288,26 @@ class Transformer : public EncoderOrDecoderBase { // Caching transformation of the encoder that should not be created again. 
// @TODO: set this automatically by memoizing encoder context and // memoization propagation (short-term) - std::pair, Expr>::iterator, bool> cache_result; - if (cache - && !((cache_result = cache_.insert(std::pair, Expr>({prefix + "_keys", keys->shape()}, kh))).second) - ) { - kh = cache_result.first->second; - } else { + if (cache // if caching + && cache_.count(prefix + "_keys") > 0 // and the keys expression has been seen + && cache_[prefix + "_keys"]->shape().elements() == keys->shape().elements()) { // and the underlying element size did not change + kh = cache_[prefix + "_keys"]; // then return cached tensor + } + else { int dimKeys = keys->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation auto Wk = graph_->param(prefix + "_Wk", {dimKeys, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); auto bk = graph_->param(prefix + "_bk", {1, dimModel}, inits::zeros()); kh = affine(keys, Wk, bk); // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] kh = SplitHeads(kh, dimHeads); // [-4: batch size, -3: num heads, -2: max length, -1: split vector dim] - if (cache) cache_result.first->second = kh; + cache_[prefix + "_keys"] = kh; } Expr vh; - if (cache - && !((cache_result = cache_.insert(std::pair, Expr>({prefix + "_values", values->shape()}, vh))).second) - ) { - vh = cache_result.first->second; + if (cache + && cache_.count(prefix + "_values") > 0 + && cache_[prefix + "_values"]->shape().elements() == values->shape().elements()) { + vh = cache_[prefix + "_values"]; } else { int dimValues = values->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation auto Wv = graph_->param(prefix + "_Wv", {dimValues, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); @@ -316,7 +315,7 @@ class Transformer : public EncoderOrDecoderBase { vh = affine(values, Wv, bv); // [-4: batch size, -3: num heads, -2: max length, -1: split vector dim] vh = SplitHeads(vh, dimHeads); - if (cache) cache_result.first->second = vh; + cache_[prefix + "_values"] = vh; } int dimBeam = q->shape()[-4]; @@ -383,7 +382,7 @@ class Transformer : public EncoderOrDecoderBase { // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); - + auto opsPost = opt("transformer-postprocess"); output = postProcess(prefix + "_Wo", opsPost, output, input, dropProb); @@ -573,7 +572,7 @@ class EncoderTransformer : public Transformer { auto embeddingLayer = getEmbeddingLayer(opt("ulr", false)); std::tie(batchEmbeddings, batchMask) = embeddingLayer->apply((*batch)[batchIndex_]); batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch); - + // reorganize batch and timestep batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim] batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1] @@ -608,7 +607,7 @@ class EncoderTransformer : public Transformer { } // this allows to run a final layernorm operation after going through the transformer layer stack. - // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. 
Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); layer = postProcess(prefix_ + "_top", opsTop, layer, prevLayer, dropProb); @@ -637,14 +636,14 @@ class TransformerState : public DecoderState { int beamSize) const override { // @TODO: code duplication with DecoderState only because of isBatchMajor=true, should rather be a contructor argument of DecoderState? - + std::vector> newEncStates; - for(auto& es : encStates_) - // If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries + for(auto& es : encStates_) + // If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices)); // Create hypothesis-selected state based on current state and hyp indices - auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/true), logProbs_, newEncStates, batch_); + auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/true), logProbs_, newEncStates, batch_); // Set the same target token position as the current state // @TODO: This is the same as in base function. @@ -778,8 +777,8 @@ class DecoderTransformer : public Transformer { // This would happen if something goes wrong during batch pruning. ABORT_IF(encoderContext->shape()[-3] != dimBatch, - "Context and query batch dimension do not match {} != {}", - encoderContext->shape()[-3], + "Context and query batch dimension do not match {} != {}", + encoderContext->shape()[-3], dimBatch); // LayerAttention expects mask in a different layout @@ -886,7 +885,7 @@ class DecoderTransformer : public Transformer { } // This allows to run a final layernorm operation after going through the transformer layer stack. - // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); query = postProcess(prefix_ + "_top", opsTop, query, prevQuery, dropProb); @@ -899,7 +898,7 @@ class DecoderTransformer : public Transformer { if(shortlist_) output_->setShortlist(shortlist_); auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] - + // return unormalized(!) probabilities Ptr nextState; if (opt("transformer-decoder-autoreg", "self-attention") == "rnn") { @@ -924,9 +923,9 @@ class DecoderTransformer : public Transformer { output_->clear(); cache_.clear(); alignments_.clear(); - perLayerRnn_.clear(); // this needs to be cleared between batches. - // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, - // but where underlying memory has been deallocated by dropping all tensors + perLayerRnn_.clear(); // this needs to be cleared between batches. + // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, + // but where underlying memory has been deallocated by dropping all tensors // from a TensorAllocator object. 
This can happen during ExpressionGraph::clear()
   }
 };

From 5df240f534ab6124b8fd9fb8e18b116970323fde Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz
Date: Tue, 31 May 2022 12:38:47 +0100
Subject: [PATCH 177/254] Update status badges (#935)

---
 README.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7fa003e19..a8d84c2af 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,9 @@
 Marian
 ======
-
-[![Build Status CUDA 10](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.2.svg?label=CUDA%2010.2)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.2/)
-[![Build Status CUDA 11](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-11.4.svg?label=CUDA%2011.4)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-11.4/)
-[![Build Status CPU](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/)
-[![Tests Status](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-regression-tests.svg?label=tests)](http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/)
+[![Ubuntu](https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml/badge.svg)](https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml)
+[![Windows](https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml/badge.svg)](https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml)
+[![MacOS](https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml/badge.svg)](https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml)
 [![Latest release](https://img.shields.io/github/release/marian-nmt/marian.svg?label=release)](https://github.com/marian-nmt/marian/releases)
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE.md)
 [![Twitter](https://img.shields.io/twitter/follow/marian_nmt.svg?style=social)](https://twitter.com/intent/follow?screen_name=marian_nmt)

From c5081df93fe31a8b77ec97b8d5989ed079dd60a7 Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz
Date: Tue, 31 May 2022 15:31:39 +0000
Subject: [PATCH 178/254] Merged PR 24111: Remove external reference to Docker images

The reference to docker.io triggers a security warning
(https://eng.ms/docs/more/containers-secure-supply-chain), making our
pipelines flash orange, which obscures the real status of regression testing.

This PR simply replaces the external reference with an internal mirror
(https://eng.ms/docs/more/containers-secure-supply-chain/approved-images).
---
 contrib/triton-aml/Dockerfile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/contrib/triton-aml/Dockerfile b/contrib/triton-aml/Dockerfile
index f2d29a5f6..0d0ed707b 100644
--- a/contrib/triton-aml/Dockerfile
+++ b/contrib/triton-aml/Dockerfile
@@ -1,5 +1,6 @@
 # It is recommended to use a machine which supports CUDA to build this image.
-FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS BUILDER
+# Remove 'mcr.microsoft.com/mirror/nvcr/' below if you want to pull from the official NVIDIA image
+FROM mcr.microsoft.com/mirror/nvcr/nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 AS BUILDER
 RUN apt-get update --fix-missing
 RUN apt-get install -y curl git autoconf automake libtool curl make g++ unzip cmake build-essential cpio
 RUN apt-get -y clean && \
@@ -58,7 +59,8 @@ RUN cmake ..
-DCOMPILE_CPU=on -DCOMPILE_CUDA=on -DUSE_SENTENCEPIECE=on -DUSE_STA RUN make -j $(grep -c ^processor /proc/cpuinfo) # build cmarian static library -FROM nvcr.io/nvidia/tritonserver:20.09-py3 +# Replace 'mcr.microsoft.com/mirror/nvcr/' with 'nvcr.io/' below if you want to pull from the official NVIDIA image. Tested using 'nvidia/tritonserver:20.09-py3'. +FROM mcr.microsoft.com/mirror/nvcr/nvidia/tritonserver:22.03-py3 RUN mkdir -p /marian-dev/build/src/3rd_party/sentencepiece/src COPY --from=BUILDER /usr/lib/libprotobuf.a /usr/lib COPY --from=BUILDER /usr/lib/libboost_system.a /usr/lib From e27da623938b84f9abe600774af6fad4fd5f1dd6 Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Mon, 6 Jun 2022 13:32:58 +0100 Subject: [PATCH 179/254] Directory listing in Ubuntu and macOS workflows (#938) --- .github/workflows/macos.yml | 2 +- .github/workflows/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 20907d9b6..c16213793 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -48,4 +48,4 @@ jobs: ./marian-decoder --version ./marian-scorer --version ./spm_encode --version - + ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \)) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 4a0fa6746..902c24569 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -122,4 +122,4 @@ jobs: ./marian-scorer --version ./marian-server --version ./spm_encode --version - + ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \)) From a90950ea25f03eb753c12c709c60dece75e6658d Mon Sep 17 00:00:00 2001 From: Alex Muzio Date: Wed, 10 Aug 2022 22:23:47 +0000 Subject: [PATCH 180/254] Merged PR 25154: Add model shapes flag to model_info.py script Add model shapes flag to model_info.py script through `--matrix-shapes` flag This will print something like: ``` ... 
encoder_l6_ffn_W1 (1024, 4096) encoder_l6_ffn_W2 (4096, 1024) encoder_l6_ffn_b1 (1, 4096) encoder_l6_ffn_b2 (1, 1024) encoder_l6_ffn_ffn_ln_bias (1, 1024) encoder_l6_ffn_ffn_ln_scale (1, 1024) encoder_l6_self_Wk (1024, 1024) encoder_l6_self_Wo (1024, 1024) encoder_l6_self_Wo_ln_bias (1, 1024) encoder_l6_self_Wo_ln_scale (1, 1024) encoder_l6_self_Wq (1024, 1024) encoder_l6_self_Wv (1024, 1024) encoder_l6_self_bk (1, 1024) encoder_l6_self_bo (1, 1024) encoder_l6_self_bq (1, 1024) encoder_l6_self_bv (1, 1024) special:model.yml (1264,) ``` --- scripts/contrib/model_info.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/contrib/model_info.py b/scripts/contrib/model_info.py index 1a022e188..3c5730844 100755 --- a/scripts/contrib/model_info.py +++ b/scripts/contrib/model_info.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import sys import argparse import numpy as np import yaml @@ -44,7 +43,10 @@ def main(): print(model[args.key]) else: for key in model: - print(key) + if args.matrix_shapes: + print(key, model[key].shape) + else: + print(key) def parse_args(): @@ -54,7 +56,9 @@ def parse_args(): parser.add_argument("-s", "--special", action="store_true", help="print values from special:model.yml node") parser.add_argument("-f", "--full-matrix", action="store_true", - help="force numpy to print full arrays") + help="force numpy to print full arrays for single key") + parser.add_argument("-ms", "--matrix-shapes", action="store_true", + help="print shapes of all arrays in the model") return parser.parse_args() From 5d466bc367b2069834a44933ff13479d39cf4ae3 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 2 Sep 2022 05:55:20 +0000 Subject: [PATCH 181/254] Merged PR 25507: Upgrade Azure Pipelines to ubuntu-20.04 Ubuntu-18.04 will not be supported after October 2022. 
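For reference, the name/shape listing produced by `--matrix-shapes` in the previous patch (PR 25154) can also be obtained directly with NumPy, since Marian checkpoints are plain `.npz` archives. A minimal sketch; the checkpoint path "model.npz" is only illustrative:

```
#!/usr/bin/env python3
# Minimal sketch: list parameter names and shapes of a Marian .npz checkpoint,
# mirroring `model_info.py --matrix-shapes`. "model.npz" is a placeholder path.
import numpy as np

model = np.load("model.npz")      # NpzFile behaves like a read-only dict of arrays
for name in sorted(model.files):  # e.g. encoder_l6_ffn_W1, special:model.yml, ...
    print(name, model[name].shape)
```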
--- azure-pipelines.yml | 56 +++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 192f0c871..a6fea5da6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -190,7 +190,7 @@ stages: - job: BuildUbuntu condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu - timeoutInMinutes: 90 + timeoutInMinutes: 120 # Minimal tested configurations for marian-dev v1.11 and C++17: # * Ubuntu 16.04, GCC 7.5, CMake 3.10.2, CUDA 9.2 (probably GCC 6 would work too) @@ -200,51 +200,50 @@ stages: ################################################################ # Ubuntu CPU-only build "CPU-only": - image: ubuntu-18.04 + image: ubuntu-20.04 boost: true cpu: true gpu: false cuda: "" - gcc: 7 + gcc: 9 unit_tests: true examples: false static: true # Ubuntu GPU-only build "GPU-only": - image: ubuntu-18.04 + image: ubuntu-20.04 boost: true cpu: false gpu: true - cuda: 10.2 - gcc: 7 + cuda: 11.1 + gcc: 9 unit_tests: false examples: false static: false ################################################################ - # Ubuntu 20.04 supports CUDA 11+ + # Ubuntu 22.04 supports CUDA 11+ # # CPU is disabled because FBGEMM + GCC 9+ do not compile on machines with # avx512_vnni, see https://github.com/marian-nmt/marian-dev/issues/709 - # - "20.04 CUDA 11.1 gcc-9": - image: ubuntu-20.04 - boost: false # ubuntu-20.04 does not have Boost pre-installed yet - cpu: false # the used fbgemm does not compile with gcc 9+ + "22.04 CUDA 11.7 gcc-11": + image: ubuntu-22.04 + boost: false + cpu: false gpu: true - cuda: 11.1 - gcc: 9 + cuda: 11.7 + gcc: 11 unit_tests: false # disable unit tests to minimize compilation time examples: false # disable examples to minimize compilation time static: false ################################################################ - # Ubuntu 18.04 supports CUDA 10.1+ - "18.04 CUDA 10.2 gcc-8": - image: ubuntu-18.04 + # Ubuntu 20.04 supports CUDA 11+ + "20.04 CUDA 11.1 gcc-9": + image: ubuntu-20.04 boost: true cpu: true gpu: true - cuda: 10.2 - gcc: 8 + cuda: 11.1 + gcc: 9 unit_tests: true examples: true static: true @@ -260,8 +259,9 @@ stages: # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev - # GCC 8 and lower are no longer pre-installed - - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-$(gcc) g++-$(gcc) + # Note that installation of libunwind-dev is a bug fix for ubuntu-22.04 images on Azure/GitHub-hosted machines + # and is normally not required + - bash: sudo apt-get install -y libunwind-dev libgoogle-perftools-dev libprotobuf-dev protobuf-compiler displayName: Install packages # Boost is no longer pre-installed on Azure/GitHub-hosted runners @@ -377,7 +377,7 @@ stages: displayName: Linux CPU library install pool: - vmImage: ubuntu-18.04 + vmImage: ubuntu-20.04 steps: - checkout: self @@ -396,14 +396,14 @@ stages: sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 displayName: Install MKL - - bash: /usr/bin/gcc-7 --version + - bash: /usr/bin/gcc-9 --version displayName: Print GCC version - bash: | mkdir -p install mkdir -p build cd build - CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 \ + CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 \ cmake .. 
\ -DCMAKE_INSTALL_PREFIX=../install \ -DCMAKE_BUILD_TYPE=slim \ @@ -528,6 +528,7 @@ stages: displayName: Machine statistics workingDirectory: marian-prod-tests + # The current SAS token will expire on 8/30/2023 and a new one will need to be set in Marian > Pipelines > Library - bash: | cd models bash download-models.sh @@ -571,7 +572,7 @@ stages: # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev - - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-8 g++-8 + - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-9 g++-9 displayName: Install packages # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html @@ -588,7 +589,7 @@ stages: mkdir -p install mkdir -p build cd build - CC=/usr/bin/gcc-8 CXX=/usr/bin/g++-8 \ + CC=/usr/bin/gcc-9 CXX=/usr/bin/g++-9 \ cmake .. \ -DCMAKE_BUILD_TYPE=slim \ -DCOMPILE_CPU=on \ @@ -624,10 +625,11 @@ stages: lscpu | tee lscpu.log echo ">>> cpuinfo" cat /proc/cpuinfo | tee cpuinfo.log - /usr/bin/gcc-8 --version | tee gcc.log + /usr/bin/gcc-9 --version | tee gcc.log displayName: Machine statistics workingDirectory: marian-prod-tests + # The current SAS token will expire on 8/30/2023 and a new one will need to be set in Marian > Pipelines > Library - bash: | cd models bash download-models.sh From f9a1ed10ce408b1f3a9c1d197b74b5ac4b1b609f Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 2 Sep 2022 07:19:33 +0100 Subject: [PATCH 182/254] Add a workflow compiling Marian using clang-14 (#940) * Add a cpu-only compilation using clang-14 * Always install gcc/g++ * Use ubuntu-20.04 image * Define clang variable in all jobs --- .github/workflows/ubuntu.yml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 902c24569..a889df16c 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -16,6 +16,17 @@ jobs: os: ubuntu-18.04 cuda: "" gcc: 7 + clang: "" + cpu: true + gpu: false + unit_tests: true + examples: false + # Using Clang compiler + - name: "Ubuntu CPU-only clang-12" + os: ubuntu-20.04 + cuda: "" + gcc: "" + clang: 12 cpu: true gpu: false unit_tests: true @@ -25,6 +36,7 @@ jobs: os: ubuntu-18.04 cuda: "10.2" gcc: 7 + clang: "" cpu: false gpu: true unit_tests: false @@ -35,6 +47,7 @@ jobs: os: ubuntu-20.04 cuda: "11.2" gcc: 9 + clang: "" cpu: false gpu: true unit_tests: false @@ -45,6 +58,7 @@ jobs: os: ubuntu-18.04 cuda: "10.2" gcc: 8 + clang: "" cpu: true gpu: true unit_tests: false @@ -64,10 +78,11 @@ jobs: # The following packages are already installed on GitHub-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev # Boost is no longer pre-installed on GitHub-hosted runners + # Clang 12.0 is pre-installed on the ubuntu-20.04 image - name: Install dependencies run: | - sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev \ - gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} + sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev + [ -z "${{ matrix.gcc }}" ] || sudo apt-get install -y gcc-${{ matrix.gcc }} 
g++-${{ matrix.gcc }} # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL @@ -86,9 +101,10 @@ jobs: # https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671 - name: Configure CMake run: | + [ -z "${{ matrix.gcc }}" ] || export CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} + [ -z "${{ matrix.clang }}" ] || export CC=/usr/bin/clang-${{ matrix.clang }} CXX=/usr/bin/clang++-${{ matrix.clang }} mkdir -p build cd build - CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \ cmake .. \ -DBoost_ARCHITECTURE=-x64 \ -DCMAKE_BUILD_TYPE=Release \ From 6250cd88bb958c9aaebdd53ef4df3f7e98be04e7 Mon Sep 17 00:00:00 2001 From: KOLANICH Date: Fri, 2 Sep 2022 08:04:23 +0000 Subject: [PATCH 183/254] Fixed some warnings on clang 15 that are promoted into errors (#936) --- src/data/factored_vocab.cpp | 2 +- src/data/shortlist.h | 3 --- src/training/communicator.h | 2 -- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index e05f31225..caee2e0c3 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -130,7 +130,7 @@ namespace marian { // @TODO: add checks for empty factor groups until it stops crashing (training already works; decoder still crashes) io::InputFileStream in(modelPath); - for (WordIndex v = 0; io::getline(in, line); v++) { + for(; io::getline(in, line);) { utils::splitAny(line, tokBuf, " \t"); factorMapTokenized.push_back(tokBuf); } diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 82b0df69a..bf185d570 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -221,7 +221,6 @@ class LexicalShortlistGenerator : public ShortlistGenerator { } void prune(float threshold = 0.f) { - size_t i = 0; for(auto& probs : data_) { std::vector> sorter; for(auto& it : probs) @@ -237,8 +236,6 @@ class LexicalShortlistGenerator : public ShortlistGenerator { else break; } - - ++i; } } diff --git a/src/training/communicator.h b/src/training/communicator.h index c24caadcd..5ab1b6b27 100644 --- a/src/training/communicator.h +++ b/src/training/communicator.h @@ -130,7 +130,6 @@ class DefaultCommunicator : public ICommunicator { int totalSize = (int)graphs_[0]->params()->vals()->size(); int shardSize = (int)ceil(totalSize / (float)graphs_.size()); - int pos = 0; for(auto graph : graphs_) { int __size__ = std::min(shardSize, totalSize); @@ -145,7 +144,6 @@ class DefaultCommunicator : public ICommunicator { tmpTensors_.push_back(tmp); // move to next shard - pos += __size__; totalSize -= __size__; } } From 3bd281c6c943630fd3a313d81bcadef4304294b6 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Fri, 2 Sep 2022 10:36:15 +0100 Subject: [PATCH 184/254] Fix clang 13.0.1 (#939) Co-authored-by: Roman Grundkiewicz --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01aea0251..1ddb632bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load - Fixed check for `fortran_ordering` in cnpy - Fixed fp16 training/inference with factors-combine concat method +- Fixed clang 13.0.1 compatibility ### Changed - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce From 650cf19e5aee51e7f33a2eb7e0846d634726bff2 Mon Sep 17 00:00:00 2001 From: Graeme Nail Date: Fri, 2 Sep 2022 10:36:43 +0100 Subject: [PATCH 185/254] Update Catch2 from 2.10.1 to 2.13.9 (#941) * Update Catch2 from 2.10.1 to 2.13.9 * Update CHANGELOG --- CHANGELOG.md | 1 + src/3rd_party/catch.hpp | 2013 ++++++++++++++++++++++++--------------- 2 files changed, 1257 insertions(+), 757 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ddb632bd..a6614909e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Faster LSH top-k search on CPU - Updated intgemm to the latest upstream version - Parameters in npz files are no longer implicitly assumed to be row-ordered. Non row-ordered parameters will result in an abort +- Updated Catch2 header from 2.10.1 to 2.13.9 ## [1.11.0] - 2022-02-08 diff --git a/src/3rd_party/catch.hpp b/src/3rd_party/catch.hpp index 5d104bc46..07efa655e 100644 --- a/src/3rd_party/catch.hpp +++ b/src/3rd_party/catch.hpp @@ -1,9 +1,9 @@ /* - * Catch v2.10.1 - * Generated: 2019-10-20 20:52:21.372334 + * Catch v2.13.9 + * Generated: 2022-04-12 22:37:23.260201 * ---------------------------------------------------------- * This file has been merged from multiple headers. Please don't edit it directly - * Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved. + * Copyright (c) 2022 Two Blue Cubes Ltd. All rights reserved. * * Distributed under the Boost Software License, Version 1.0. (See accompanying * file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -14,8 +14,8 @@ #define CATCH_VERSION_MAJOR 2 -#define CATCH_VERSION_MINOR 10 -#define CATCH_VERSION_PATCH 1 +#define CATCH_VERSION_MINOR 13 +#define CATCH_VERSION_PATCH 9 #ifdef __clang__ # pragma clang system_header @@ -66,13 +66,16 @@ #if !defined(CATCH_CONFIG_IMPL_ONLY) // start catch_platform.h +// See e.g.: +// https://opensource.apple.com/source/CarbonHeaders/CarbonHeaders-18.1/TargetConditionals.h.auto.html #ifdef __APPLE__ -# include -# if TARGET_OS_OSX == 1 -# define CATCH_PLATFORM_MAC -# elif TARGET_OS_IPHONE == 1 -# define CATCH_PLATFORM_IPHONE -# endif +# include +# if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1) || \ + (defined(TARGET_OS_MAC) && TARGET_OS_MAC == 1) +# define CATCH_PLATFORM_MAC +# elif (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE == 1) +# define CATCH_PLATFORM_IPHONE +# endif #elif defined(linux) || defined(__linux) || defined(__linux__) # define CATCH_PLATFORM_LINUX @@ -132,42 +135,52 @@ namespace Catch { #endif -#if defined(CATCH_CPP17_OR_GREATER) -# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +// Only GCC compiler should be used in this block, so other compilers trying to +// mask themselves as GCC should be ignored. +#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && !defined(__CUDACC__) && !defined(__LCC__) +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic push" ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic pop" ) + +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
(void)__builtin_constant_p(__VA_ARGS__) + #endif -#ifdef __clang__ +#if defined(__clang__) + +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic push" ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic pop" ) + +// As of this writing, IBM XL's implementation of __builtin_constant_p has a bug +// which results in calls to destructors being emitted for each temporary, +// without a matching initialization. In practice, this can result in something +// like `std::string::~string` being called on an uninitialized value. +// +// For example, this code will likely segfault under IBM XL: +// ``` +// REQUIRE(std::string("12") + "34" == "1234") +// ``` +// +// Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented. +# if !defined(__ibmxl__) && !defined(__CUDACC__) +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, hicpp-vararg) */ +# endif + +# define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \ + _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"") + +# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) + +# define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" ) + +# define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" ) + +# define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wunused-template\"" ) -# define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \ - _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"") -# define CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wunused-template\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ - _Pragma( "clang diagnostic pop" ) #endif // __clang__ //////////////////////////////////////////////////////////////////////////////// @@ -225,11 +238,7 @@ namespace Catch { //////////////////////////////////////////////////////////////////////////////// // Visual C++ -#ifdef _MSC_VER - -# if _MSC_VER >= 1900 // Visual Studio 2015 or newer -# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS -# endif +#if defined(_MSC_VER) // Universal Windows platform does not support SEH // Or console colours (or console at all...) 
@@ -239,12 +248,20 @@ namespace Catch { # define CATCH_INTERNAL_CONFIG_WINDOWS_SEH # endif +# if !defined(__clang__) // Handle Clang masquerading for msvc + // MSVC traditional preprocessor needs some workaround for __VA_ARGS__ // _MSVC_TRADITIONAL == 0 means new conformant preprocessor // _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor -# if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL) -# define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR -# endif +# if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL) +# define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR +# endif // MSVC_TRADITIONAL + +// Only do this if we're not using clang on Windows, which uses `diagnostic push` & `diagnostic pop` +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma( warning(push) ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION __pragma( warning(pop) ) +# endif // __clang__ + #endif // _MSC_VER #if defined(_REENTRANT) || defined(_MSC_VER) @@ -292,7 +309,7 @@ namespace Catch { #define CATCH_CONFIG_COLOUR_NONE #endif -#if defined(__UCLIBC__) +#if !defined(_GLIBCXX_USE_C99_MATH_TR1) #define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER #endif @@ -310,7 +327,10 @@ namespace Catch { // Check if byte is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) - # define CATCH_INTERNAL_CONFIG_CPP17_BYTE + # include + # if defined(__cpp_lib_byte) && (__cpp_lib_byte > 0) + # define CATCH_INTERNAL_CONFIG_CPP17_BYTE + # endif # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) // Check if variant is available and usable @@ -353,10 +373,6 @@ namespace Catch { # define CATCH_CONFIG_CPP17_OPTIONAL #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) -# define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS -#endif - #if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_CPP17_STRING_VIEW) # define CATCH_CONFIG_CPP17_STRING_VIEW #endif @@ -397,34 +413,41 @@ namespace Catch { # define CATCH_CONFIG_GLOBAL_NEXTAFTER #endif +// Even if we do not think the compiler has that warning, we still have +// to provide a macro that can be used by the code. +#if !defined(CATCH_INTERNAL_START_WARNINGS_SUPPRESSION) +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +#endif +#if !defined(CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +#endif #if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS +#endif + +// The goal of this macro is to avoid evaluation of the arguments, but +// still have the compiler warn on problems inside... +#if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN) +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
#endif #if defined(__APPLE__) && defined(__apple_build_version__) && (__clang_major__ < 10) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS -# undef CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS #elif defined(__clang__) && (__clang_major__ < 5) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS -# undef CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) @@ -530,9 +553,10 @@ namespace Catch { } // end namespace Catch #define CATCH_REGISTER_TAG_ALIAS( alias, spec ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME( AutoRegisterTagAlias )( alias, spec, CATCH_INTERNAL_LINEINFO ); } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION // end catch_tag_alias_autoregistrar.h // start catch_test_registry.h @@ -578,49 +602,24 @@ namespace Catch { /// A non-owning string class (similar to the forthcoming std::string_view) /// Note that, because a StringRef may be a substring of another string, - /// it may not be null terminated. c_str() must return a null terminated - /// string, however, and so the StringRef will internally take ownership - /// (taking a copy), if necessary. In theory this ownership is not externally - /// visible - but it does mean (substring) StringRefs should not be shared between - /// threads. + /// it may not be null terminated. class StringRef { public: using size_type = std::size_t; using const_iterator = const char*; private: - friend struct StringRefTestAccess; - - char const* m_start; - size_type m_size; - - char* m_data = nullptr; - - void takeOwnership(); - static constexpr char const* const s_empty = ""; - public: // construction/ assignment - StringRef() noexcept - : StringRef( s_empty, 0 ) - {} - - StringRef( StringRef const& other ) noexcept - : m_start( other.m_start ), - m_size( other.m_size ) - {} + char const* m_start = s_empty; + size_type m_size = 0; - StringRef( StringRef&& other ) noexcept - : m_start( other.m_start ), - m_size( other.m_size ), - m_data( other.m_data ) - { - other.m_data = nullptr; - } + public: // construction + constexpr StringRef() noexcept = default; StringRef( char const* rawChars ) noexcept; - StringRef( char const* rawChars, size_type size ) noexcept + constexpr StringRef( char const* rawChars, size_type size ) noexcept : m_start( rawChars ), m_size( size ) {} @@ -630,27 +629,15 @@ namespace Catch { m_size( stdString.size() ) {} - ~StringRef() noexcept { - delete[] m_data; - } - - auto operator = ( StringRef const &other ) noexcept -> StringRef& { - delete[] m_data; - m_data = nullptr; - m_start = other.m_start; - m_size = other.m_size; - return *this; - } - explicit operator std::string() const { return std::string(m_start, m_size); } - void swap( StringRef& other ) noexcept; - public: // operators auto operator == ( StringRef const& other ) const noexcept -> bool; - auto operator != ( StringRef const& other ) const noexcept -> bool; + auto operator != (StringRef const& other) const noexcept -> bool { + return !(*this == other); + } auto operator[] ( size_type index ) const noexcept -> char { assert(index < m_size); @@ -658,41 +645,44 @@ namespace Catch { } public: // named queries - auto empty() const noexcept -> bool { + constexpr 
auto empty() const noexcept -> bool { return m_size == 0; } - auto size() const noexcept -> size_type { + constexpr auto size() const noexcept -> size_type { return m_size; } + // Returns the current start pointer. If the StringRef is not + // null-terminated, throws std::domain_exception auto c_str() const -> char const*; public: // substrings and searches - auto substr( size_type start, size_type size ) const noexcept -> StringRef; + // Returns a substring of [start, start + length). + // If start + length > size(), then the substring is [start, size()). + // If start > size(), then the substring is empty. + auto substr( size_type start, size_type length ) const noexcept -> StringRef; - // Returns the current start pointer. - // Note that the pointer can change when if the StringRef is a substring - auto currentData() const noexcept -> char const*; + // Returns the current start pointer. May not be null-terminated. + auto data() const noexcept -> char const*; - public: // iterators - const_iterator begin() const { return m_start; } - const_iterator end() const { return m_start + m_size; } + constexpr auto isNullTerminated() const noexcept -> bool { + return m_start[m_size] == '\0'; + } - private: // ownership queries - may not be consistent between calls - auto isOwned() const noexcept -> bool; - auto isSubstring() const noexcept -> bool; + public: // iterators + constexpr const_iterator begin() const { return m_start; } + constexpr const_iterator end() const { return m_start + m_size; } }; auto operator += ( std::string& lhs, StringRef const& sr ) -> std::string&; auto operator << ( std::ostream& os, StringRef const& sr ) -> std::ostream&; - inline auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef { + constexpr auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef { return StringRef( rawChars, size ); } - } // namespace Catch -inline auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef { +constexpr auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef { return Catch::StringRef( rawChars, size ); } @@ -781,7 +771,7 @@ inline auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noex #define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3) #define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4) #define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5) -#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _4, _5, _6) +#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6) #define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7) #define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8) #define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) INTERNAL_CATCH_REMOVE_PARENS(_0), 
INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9) @@ -931,22 +921,33 @@ inline auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noex #include namespace Catch { -template -struct always_false : std::false_type {}; + template + struct always_false : std::false_type {}; + + template struct true_given : std::true_type {}; + struct is_callable_tester { + template + true_given()(std::declval()...))> static test(int); + template + std::false_type static test(...); + }; -template struct true_given : std::true_type {}; -struct is_callable_tester { - template - true_given()(std::declval()...))> static test(int); - template - std::false_type static test(...); -}; + template + struct is_callable; -template -struct is_callable; + template + struct is_callable : decltype(is_callable_tester::test(0)) {}; -template -struct is_callable : decltype(is_callable_tester::test(0)) {}; +#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703 + // std::result_of is deprecated in C++17 and removed in C++20. Hence, it is + // replaced with std::invoke_result here. + template + using FunctionReturnType = std::remove_reference_t>>; +#else + // Keep ::type here because we still support C++11 + template + using FunctionReturnType = typename std::remove_reference::type>::type>::type; +#endif } // namespace Catch @@ -1011,55 +1012,58 @@ struct AutoReg : NonCopyable { #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags, Signature, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags, Signature, ...) 
\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION( ClassName, Name, Tags,... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION( ClassName, Name, Tags,... ) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) #endif #endif /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TESTCASE2( TestName, ... 
) \ static void TestName(); \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( &TestName ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ __VA_ARGS__ } ); } /* NOLINT */ \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ static void TestName() #define INTERNAL_CATCH_TESTCASE( ... ) \ - INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), __VA_ARGS__ ) + INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), __VA_ARGS__ ) /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( &QualifiedMethod ), CATCH_INTERNAL_LINEINFO, "&" #QualifiedMethod, Catch::NameAndTags{ __VA_ARGS__ } ); } /* NOLINT */ \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestName, ClassName, ... )\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ \ struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName) { \ @@ -1067,19 +1071,21 @@ struct AutoReg : NonCopyable { }; \ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar ) ( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ __VA_ARGS__ } ); /* NOLINT */ \ } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ void TestName::test() #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, ... ) \ - INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), ClassName, __VA_ARGS__ ) + INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), ClassName, __VA_ARGS__ ) /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( Function ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ __VA_ARGS__ } ); /* NOLINT */ \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(TestName, TestFunc, Name, Tags, Signature, ... )\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ @@ -1095,7 +1101,7 @@ struct AutoReg : NonCopyable { int index = 0; \ constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\ using expander = int[];\ - (void)expander{(reg_test(Types{}, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(reg_test(Types{}, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... 
};/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1104,31 +1110,30 @@ struct AutoReg : NonCopyable { }();\ }\ }\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ INTERNAL_CATCH_DEFINE_SIG_TEST(TestFunc,INTERNAL_CATCH_REMOVE_PARENS(Signature)) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...) 
\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(TestName, TestFuncName, Name, Tags, Signature, TmplTypes, TypesList) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ template static void TestFuncName(); \ namespace {\ namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) { \ @@ -1142,7 +1147,7 @@ struct AutoReg : NonCopyable { constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\ constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\ constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFuncName ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++, 0)... };/* NOLINT */\ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFuncName ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... 
};/* NOLINT */\ } \ }; \ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \ @@ -1153,29 +1158,28 @@ struct AutoReg : NonCopyable { }(); \ } \ } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ static void TestFuncName() #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename T,__VA_ARGS__) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename T,__VA_ARGS__) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename T, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename T, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature, ...)\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature, ...)\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2(TestName, TestFunc, Name, Tags, TmplList)\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ template static void TestFunc(); \ @@ -1187,7 +1191,7 @@ struct AutoReg : NonCopyable { void reg_tests() { \ int index = 0; \ using expander = int[]; \ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFunc ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++, 0)... 
};/* NOLINT */\ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFunc ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */\ } \ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \ @@ -1195,17 +1199,17 @@ struct AutoReg : NonCopyable { TestInit t; \ t.reg_tests(); \ return 0; \ - }(); \ + }(); \ }}\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ static void TestFunc() #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE(Name, Tags, TmplList) \ - INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, TmplList ) + INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, TmplList ) #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( TestNameClass, TestName, ClassName, Name, Tags, Signature, ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ @@ -1221,7 +1225,7 @@ struct AutoReg : NonCopyable { int index = 0; \ constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\ using expander = int[];\ - (void)expander{(reg_test(Types{}, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(reg_test(Types{}, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... };/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1230,28 +1234,27 @@ struct AutoReg : NonCopyable { }();\ }\ }\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS\ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS\ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS\ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature)) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( ClassName, Name, Tags,... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( ClassName, Name, Tags,... 
) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(TestNameClass, TestName, ClassName, Name, Tags, Signature, TmplTypes, TypesList)\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ @@ -1271,7 +1274,7 @@ struct AutoReg : NonCopyable { constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\ constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\ constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... 
};/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1282,29 +1285,28 @@ struct AutoReg : NonCopyable { }(); \ }\ }\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ void TestName::test() #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( ClassName, Name, Tags, ... )\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, typename T, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, typename T, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( ClassName, Name, Tags, ... )\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, typename T,__VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, typename T,__VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... )\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... 
)\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, Signature,__VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, Signature,__VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( TestNameClass, TestName, ClassName, Name, Tags, TmplList) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ template \ @@ -1319,7 +1321,7 @@ struct AutoReg : NonCopyable { void reg_tests(){\ int index = 0;\ using expander = int[];\ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1329,13 +1331,12 @@ struct AutoReg : NonCopyable { return 0;\ }(); \ }}\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ void TestName::test() #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD(ClassName, Name, Tags, TmplList) \ - INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, TmplList ) + INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, TmplList ) // end catch_test_registry.h // start catch_capture.hpp @@ -1436,7 +1437,7 @@ namespace Catch { auto makeStream( StringRef const &filename ) -> IStream const*; - class ReusableStringStream { + class ReusableStringStream : NonCopyable { std::size_t m_index; std::ostream* m_oss; public: @@ -1824,8 +1825,8 @@ namespace Catch { #endif namespace Detail { - template - std::string rangeToString(InputIterator first, InputIterator last) { + template + std::string rangeToString(InputIterator first, Sentinel last) { ReusableStringStream rss; rss << "{ "; if (first != last) { @@ -1983,20 +1984,27 @@ namespace Catch { #endif // CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER namespace Catch { - struct not_this_one {}; // Tag type for detecting which begin/ end are being selected - - // Import begin/ end from std here so they are considered alongside the fallback (...) overloads in this namespace + // Import begin/ end from std here using std::begin; using std::end; - not_this_one begin( ... ); - not_this_one end( ... 
); + namespace detail { + template + struct void_type { + using type = void; + }; + + template + struct is_range_impl : std::false_type { + }; + + template + struct is_range_impl()))>::type> : std::true_type { + }; + } // namespace detail template - struct is_range { - static const bool value = - !std::is_same())), not_this_one>::value && - !std::is_same())), not_this_one>::value; + struct is_range : detail::is_range_impl { }; #if defined(_MANAGED) // Managed types are never ranges @@ -2364,6 +2372,18 @@ namespace Catch { auto operator <= ( RhsT const& rhs ) -> BinaryExpr const { return { static_cast(m_lhs <= rhs), m_lhs, "<=", rhs }; } + template + auto operator | (RhsT const& rhs) -> BinaryExpr const { + return { static_cast(m_lhs | rhs), m_lhs, "|", rhs }; + } + template + auto operator & (RhsT const& rhs) -> BinaryExpr const { + return { static_cast(m_lhs & rhs), m_lhs, "&", rhs }; + } + template + auto operator ^ (RhsT const& rhs) -> BinaryExpr const { + return { static_cast(m_lhs ^ rhs), m_lhs, "^", rhs }; + } template auto operator && ( RhsT const& ) -> BinaryExpr const { @@ -2444,7 +2464,7 @@ namespace Catch { virtual void sectionEnded( SectionEndInfo const& endInfo ) = 0; virtual void sectionEndedEarly( SectionEndInfo const& endInfo ) = 0; - virtual auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& = 0; + virtual auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& = 0; #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) virtual void benchmarkPreparing( std::string const& name ) = 0; @@ -2682,15 +2702,16 @@ namespace Catch { /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TEST( macroName, resultDisposition, ... ) \ do { \ + CATCH_INTERNAL_IGNORE_BUT_WARN(__VA_ARGS__); \ Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition ); \ INTERNAL_CATCH_TRY { \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ catchAssertionHandler.handleExpr( Catch::Decomposer() <= __VA_ARGS__ ); \ - CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ } INTERNAL_CATCH_CATCH( catchAssertionHandler ) \ INTERNAL_CATCH_REACT( catchAssertionHandler ) \ - } while( (void)0, (false) && static_cast( !!(__VA_ARGS__) ) ) // the expression here is never evaluated at runtime but it forces the compiler to give it a look - // The double negation silences MSVC's C4800 warning, the static_cast forces short-circuit evaluation if the type has overloaded &&. + } while( (void)0, (false) && static_cast( !!(__VA_ARGS__) ) ) /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_IF( macroName, resultDisposition, ... ) \ @@ -2907,14 +2928,16 @@ namespace Catch { } // end namespace Catch #define INTERNAL_CATCH_SECTION( ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #define INTERNAL_CATCH_DYNAMIC_SECTION( ... 
) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, (Catch::ReusableStringStream() << __VA_ARGS__).str() ) ) \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION // end catch_section.h // start catch_interfaces_exception.h @@ -3005,6 +3028,9 @@ namespace Catch { {} std::string translate( ExceptionTranslators::const_iterator it, ExceptionTranslators::const_iterator itEnd ) const override { +#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) + return ""; +#else try { if( it == itEnd ) std::rethrow_exception(std::current_exception()); @@ -3014,6 +3040,7 @@ namespace Catch { catch( T& ex ) { return m_translateFunction( ex ); } +#endif } protected: @@ -3032,9 +3059,10 @@ namespace Catch { /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TRANSLATE_EXCEPTION2( translatorName, signature ) \ static std::string translatorName( signature ); \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::ExceptionTranslatorRegistrar INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionRegistrar )( &translatorName ); } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ static std::string translatorName( signature ) #define INTERNAL_CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION2( INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionTranslator ), signature ) @@ -3065,7 +3093,7 @@ namespace Detail { Approx operator-() const; template ::value>::type> - Approx operator()( T const& value ) { + Approx operator()( T const& value ) const { Approx approx( static_cast(value) ); approx.m_epsilon = m_epsilon; approx.m_margin = m_margin; @@ -3281,9 +3309,10 @@ namespace Matchers { return description; } - MatchAllOf& operator && ( MatcherBase const& other ) { - m_matchers.push_back( &other ); - return *this; + MatchAllOf operator && ( MatcherBase const& other ) { + auto copy(*this); + copy.m_matchers.push_back( &other ); + return copy; } std::vector const*> m_matchers; @@ -3314,9 +3343,10 @@ namespace Matchers { return description; } - MatchAnyOf& operator || ( MatcherBase const& other ) { - m_matchers.push_back( &other ); - return *this; + MatchAnyOf operator || ( MatcherBase const& other ) { + auto copy(*this); + copy.m_matchers.push_back( &other ); + return copy; } std::vector const*> m_matchers; @@ -3573,12 +3603,12 @@ namespace Catch { namespace Matchers { namespace Vector { - template - struct ContainsElementMatcher : MatcherBase> { + template + struct ContainsElementMatcher : MatcherBase> { ContainsElementMatcher(T const &comparator) : m_comparator( comparator) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { for (auto const& el : v) { if (el == m_comparator) { return true; @@ -3594,12 +3624,12 @@ namespace Matchers { T const& m_comparator; }; - template - struct ContainsMatcher : MatcherBase> { + template + struct ContainsMatcher : MatcherBase> { - ContainsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} + ContainsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { // !TBD: see note in EqualsMatcher if (m_comparator.size() > v.size()) return false; @@ 
-3621,18 +3651,18 @@ namespace Matchers { return "Contains: " + ::Catch::Detail::stringify( m_comparator ); } - std::vector const& m_comparator; + std::vector const& m_comparator; }; - template - struct EqualsMatcher : MatcherBase> { + template + struct EqualsMatcher : MatcherBase> { - EqualsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} + EqualsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { // !TBD: This currently works if all elements can be compared using != // - a more general approach would be via a compare template that defaults - // to using !=. but could be specialised for, e.g. std::vector etc + // to using !=. but could be specialised for, e.g. std::vector etc // - then just call that directly if (m_comparator.size() != v.size()) return false; @@ -3644,15 +3674,15 @@ namespace Matchers { std::string describe() const override { return "Equals: " + ::Catch::Detail::stringify( m_comparator ); } - std::vector const& m_comparator; + std::vector const& m_comparator; }; - template - struct ApproxMatcher : MatcherBase> { + template + struct ApproxMatcher : MatcherBase> { - ApproxMatcher(std::vector const& comparator) : m_comparator( comparator ) {} + ApproxMatcher(std::vector const& comparator) : m_comparator( comparator ) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { if (m_comparator.size() != v.size()) return false; for (std::size_t i = 0; i < v.size(); ++i) @@ -3679,16 +3709,14 @@ namespace Matchers { return *this; } - std::vector const& m_comparator; + std::vector const& m_comparator; mutable Catch::Detail::Approx approx = Catch::Detail::Approx::custom(); }; - template - struct UnorderedEqualsMatcher : MatcherBase> { - UnorderedEqualsMatcher(std::vector const& target) : m_target(target) {} - bool match(std::vector const& vec) const override { - // Note: This is a reimplementation of std::is_permutation, - // because I don't want to include inside the common path + template + struct UnorderedEqualsMatcher : MatcherBase> { + UnorderedEqualsMatcher(std::vector const& target) : m_target(target) {} + bool match(std::vector const& vec) const override { if (m_target.size() != vec.size()) { return false; } @@ -3699,7 +3727,7 @@ namespace Matchers { return "UnorderedEquals: " + ::Catch::Detail::stringify(m_target); } private: - std::vector const& m_target; + std::vector const& m_target; }; } // namespace Vector @@ -3707,29 +3735,29 @@ namespace Matchers { // The following functions create the actual matcher objects. 
// This allows the types to be inferred - template - Vector::ContainsMatcher Contains( std::vector const& comparator ) { - return Vector::ContainsMatcher( comparator ); + template, typename AllocMatch = AllocComp> + Vector::ContainsMatcher Contains( std::vector const& comparator ) { + return Vector::ContainsMatcher( comparator ); } - template - Vector::ContainsElementMatcher VectorContains( T const& comparator ) { - return Vector::ContainsElementMatcher( comparator ); + template> + Vector::ContainsElementMatcher VectorContains( T const& comparator ) { + return Vector::ContainsElementMatcher( comparator ); } - template - Vector::EqualsMatcher Equals( std::vector const& comparator ) { - return Vector::EqualsMatcher( comparator ); + template, typename AllocMatch = AllocComp> + Vector::EqualsMatcher Equals( std::vector const& comparator ) { + return Vector::EqualsMatcher( comparator ); } - template - Vector::ApproxMatcher Approx( std::vector const& comparator ) { - return Vector::ApproxMatcher( comparator ); + template, typename AllocMatch = AllocComp> + Vector::ApproxMatcher Approx( std::vector const& comparator ) { + return Vector::ApproxMatcher( comparator ); } - template - Vector::UnorderedEqualsMatcher UnorderedEquals(std::vector const& target) { - return Vector::UnorderedEqualsMatcher(target); + template, typename AllocMatch = AllocComp> + Vector::UnorderedEqualsMatcher UnorderedEquals(std::vector const& target) { + return Vector::UnorderedEqualsMatcher( target ); } } // namespace Matchers @@ -3925,7 +3953,6 @@ namespace Generators { class SingleValueGenerator final : public IGenerator { T m_value; public: - SingleValueGenerator(T const& value) : m_value( value ) {} SingleValueGenerator(T&& value) : m_value(std::move(value)) {} T const& get() const override { @@ -3988,21 +4015,21 @@ namespace Generators { m_generators.emplace_back(std::move(generator)); } void populate(T&& val) { - m_generators.emplace_back(value(std::move(val))); + m_generators.emplace_back(value(std::forward(val))); } template void populate(U&& val) { - populate(T(std::move(val))); + populate(T(std::forward(val))); } template - void populate(U&& valueOrGenerator, Gs... moreGenerators) { + void populate(U&& valueOrGenerator, Gs &&... moreGenerators) { populate(std::forward(valueOrGenerator)); populate(std::forward(moreGenerators)...); } public: template - Generators(Gs... moreGenerators) { + Generators(Gs &&... moreGenerators) { m_generators.reserve(sizeof...(Gs)); populate(std::forward(moreGenerators)...); } @@ -4033,7 +4060,7 @@ namespace Generators { struct as {}; template - auto makeGenerators( GeneratorWrapper&& generator, Gs... moreGenerators ) -> Generators { + auto makeGenerators( GeneratorWrapper&& generator, Gs &&... moreGenerators ) -> Generators { return Generators(std::move(generator), std::forward(moreGenerators)...); } template @@ -4041,24 +4068,24 @@ namespace Generators { return Generators(std::move(generator)); } template - auto makeGenerators( T&& val, Gs... moreGenerators ) -> Generators { + auto makeGenerators( T&& val, Gs &&... moreGenerators ) -> Generators { return makeGenerators( value( std::forward( val ) ), std::forward( moreGenerators )... ); } template - auto makeGenerators( as, U&& val, Gs... moreGenerators ) -> Generators { + auto makeGenerators( as, U&& val, Gs &&... moreGenerators ) -> Generators { return makeGenerators( value( T( std::forward( val ) ) ), std::forward( moreGenerators )... 
); } - auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker&; + auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker&; template // Note: The type after -> is weird, because VS2015 cannot parse // the expression used in the typedef inside, when it is in // return type. Yeah. - auto generate( SourceLineInfo const& lineInfo, L const& generatorExpression ) -> decltype(std::declval().get()) { + auto generate( StringRef generatorName, SourceLineInfo const& lineInfo, L const& generatorExpression ) -> decltype(std::declval().get()) { using UnderlyingType = typename decltype(generatorExpression())::type; - IGeneratorTracker& tracker = acquireGeneratorTracker( lineInfo ); + IGeneratorTracker& tracker = acquireGeneratorTracker( generatorName, lineInfo ); if (!tracker.hasGenerator()) { tracker.setGenerator(pf::make_unique>(generatorExpression())); } @@ -4071,11 +4098,17 @@ namespace Generators { } // namespace Catch #define GENERATE( ... ) \ - Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [ ]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) + Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \ + CATCH_INTERNAL_LINEINFO, \ + [ ]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace) #define GENERATE_COPY( ... ) \ - Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [=]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) + Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \ + CATCH_INTERNAL_LINEINFO, \ + [=]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace) #define GENERATE_REF( ... ) \ - Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [&]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) + Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \ + CATCH_INTERNAL_LINEINFO, \ + [&]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace) // end catch_generators.hpp // start catch_generators_generic.hpp @@ -4132,7 +4165,7 @@ namespace Generators { if (!m_predicate(m_generator.get())) { // It might happen that there are no values that pass the // filter. In that case we throw an exception. - auto has_initial_value = next(); + auto has_initial_value = nextImpl(); if (!has_initial_value) { Catch::throw_exception(GeneratorException("No valid value found in filtered generator")); } @@ -4144,6 +4177,11 @@ namespace Generators { } bool next() override { + return nextImpl(); + } + + private: + bool nextImpl() { bool success = m_generator.next(); if (!success) { return false; @@ -4241,18 +4279,7 @@ namespace Generators { } }; -#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703 - // std::result_of is deprecated in C++17 and removed in C++20. Hence, it is - // replaced with std::invoke_result here. Also *_t format is preferred over - // typename *::type format. 
- template - using MapFunctionReturnType = std::remove_reference_t>>; -#else - template - using MapFunctionReturnType = typename std::remove_reference::type>::type>::type; -#endif - - template > + template > GeneratorWrapper map(Func&& function, GeneratorWrapper&& generator) { return GeneratorWrapper( pf::make_unique>(std::forward(function), std::move(generator)) @@ -4438,6 +4465,7 @@ namespace Catch { } // end namespace Catch // end catch_option.hpp +#include #include #include #include @@ -4495,6 +4523,7 @@ namespace Catch { virtual int abortAfter() const = 0; virtual bool showInvisibles() const = 0; virtual ShowDurations::OrNot showDurations() const = 0; + virtual double minDuration() const = 0; virtual TestSpec const& testSpec() const = 0; virtual bool hasTestFilters() const = 0; virtual std::vector const& getTestsOrTags() const = 0; @@ -4508,6 +4537,7 @@ namespace Catch { virtual int benchmarkSamples() const = 0; virtual double benchmarkConfidenceInterval() const = 0; virtual unsigned int benchmarkResamples() const = 0; + virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0; }; using IConfigPtr = std::shared_ptr; @@ -4668,7 +4698,7 @@ class RangeGenerator final : public IGenerator { template GeneratorWrapper range(T const& start, T const& end, T const& step) { - static_assert(std::is_integral::value && !std::is_same::value, "Type must be an integer"); + static_assert(std::is_arithmetic::value && !std::is_same::value, "Type must be numeric"); return GeneratorWrapper(pf::make_unique>(start, end, step)); } @@ -5203,27 +5233,12 @@ namespace Catch { void addFilter(); bool separate(); - template - void addPattern() { - std::string token = m_patternName; - for( std::size_t i = 0; i < m_escapeChars.size(); ++i ) - token = token.substr( 0, m_escapeChars[i] - i ) + token.substr( m_escapeChars[i] -i +1 ); - m_escapeChars.clear(); - if( startsWith( token, "exclude:" ) ) { - m_exclusion = true; - token = token.substr( 8 ); - } - if( !token.empty() ) { - TestSpec::PatternPtr pattern = std::make_shared( token, m_substring ); - if( m_exclusion ) - pattern = std::make_shared( pattern ); - m_currentFilter.m_patterns.push_back( pattern ); - } - m_substring.clear(); - m_patternName.clear(); - m_exclusion = false; - m_mode = None; - } + // Handles common preprocessing of the pattern for name/tag patterns + std::string preprocessPattern(); + // Adds the current pattern as a test name + void addNamePattern(); + // Adds the current pattern as a tag + void addTagPattern(); inline void addCharToPattern(char c) { m_substring += c; @@ -5276,10 +5291,12 @@ namespace Catch { unsigned int benchmarkSamples = 100; double benchmarkConfidenceInterval = 0.95; unsigned int benchmarkResamples = 100000; + std::chrono::milliseconds::rep benchmarkWarmupTime = 100; Verbosity verbosity = Verbosity::Normal; WarnAbout::What warnings = WarnAbout::Nothing; ShowDurations::OrNot showDurations = ShowDurations::DefaultForReporter; + double minDuration = -1; RunTests::InWhatOrder runOrder = RunTests::InDeclarationOrder; UseColour::YesOrNo useColour = UseColour::Auto; WaitForKeypress::When waitForKeypress = WaitForKeypress::Never; @@ -5330,6 +5347,7 @@ namespace Catch { bool warnAboutMissingAssertions() const override; bool warnAboutNoTests() const override; ShowDurations::OrNot showDurations() const override; + double minDuration() const override; RunTests::InWhatOrder runOrder() const override; unsigned int rngSeed() const override; UseColour::YesOrNo useColour() const override; @@ -5341,6 +5359,7 @@ namespace Catch { int 
benchmarkSamples() const override; double benchmarkConfidenceInterval() const override; unsigned int benchmarkResamples() const override; + std::chrono::milliseconds benchmarkWarmupTime() const override; private: @@ -5446,6 +5465,8 @@ namespace Catch { } // namespace Catch // end catch_outlier_classification.hpp + +#include #endif // CATCH_CONFIG_ENABLE_BENCHMARKING #include @@ -5706,6 +5727,9 @@ namespace Catch { // Returns double formatted as %.3f (format expected on output) std::string getFormattedDuration( double duration ); + //! Should the reporter show + bool shouldShowDuration( IConfig const& config, double duration ); + std::string serializeFilters( std::vector const& container ); template @@ -6068,14 +6092,16 @@ namespace Catch { #if !defined(CATCH_CONFIG_DISABLE) #define CATCH_REGISTER_REPORTER( name, reporterType ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::ReporterRegistrar catch_internal_RegistrarFor##reporterType( name ); } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #define CATCH_REGISTER_LISTENER( listenerType ) \ - CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ - namespace{ Catch::ListenerRegistrar catch_internal_RegistrarFor##listenerType; } \ - CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ + CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ + namespace{ Catch::ListenerRegistrar catch_internal_RegistrarFor##listenerType; } \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #else // CATCH_CONFIG_DISABLE #define CATCH_REGISTER_REPORTER(name, reporterType) @@ -6097,8 +6123,6 @@ namespace Catch { static std::string getDescription(); - ReporterPreferences getPreferences() const override; - void noMatchingTestCases(std::string const& spec) override; void assertionStarting(AssertionInfo const&) override; @@ -6198,6 +6222,14 @@ namespace Catch { #include namespace Catch { + enum class XmlFormatting { + None = 0x00, + Indent = 0x01, + Newline = 0x02, + }; + + XmlFormatting operator | (XmlFormatting lhs, XmlFormatting rhs); + XmlFormatting operator & (XmlFormatting lhs, XmlFormatting rhs); class XmlEncode { public: @@ -6219,14 +6251,14 @@ namespace Catch { class ScopedElement { public: - ScopedElement( XmlWriter* writer ); + ScopedElement( XmlWriter* writer, XmlFormatting fmt ); ScopedElement( ScopedElement&& other ) noexcept; ScopedElement& operator=( ScopedElement&& other ) noexcept; ~ScopedElement(); - ScopedElement& writeText( std::string const& text, bool indent = true ); + ScopedElement& writeText( std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent ); template ScopedElement& writeAttribute( std::string const& name, T const& attribute ) { @@ -6236,6 +6268,7 @@ namespace Catch { private: mutable XmlWriter* m_writer = nullptr; + XmlFormatting m_fmt; }; XmlWriter( std::ostream& os = Catch::cout() ); @@ -6244,11 +6277,11 @@ namespace Catch { XmlWriter( XmlWriter const& ) = delete; XmlWriter& operator=( XmlWriter const& ) = delete; - XmlWriter& startElement( std::string const& name ); + XmlWriter& startElement( std::string const& name, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); - ScopedElement scopedElement( std::string const& name ); + ScopedElement scopedElement( std::string const& name, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); - XmlWriter& endElement(); + XmlWriter& endElement(XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); 
XmlWriter& writeAttribute( std::string const& name, std::string const& attribute ); @@ -6261,9 +6294,9 @@ namespace Catch { return writeAttribute( name, rss.str() ); } - XmlWriter& writeText( std::string const& text, bool indent = true ); + XmlWriter& writeText( std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); - XmlWriter& writeComment( std::string const& text ); + XmlWriter& writeComment(std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); void writeStylesheetRef( std::string const& url ); @@ -6273,6 +6306,8 @@ namespace Catch { private: + void applyFormatting(XmlFormatting fmt); + void writeDeclaration(); void newlineIfNecessary(); @@ -6316,9 +6351,10 @@ namespace Catch { void writeTestCase(TestCaseNode const& testCaseNode); - void writeSection(std::string const& className, - std::string const& rootName, - SectionNode const& sectionNode); + void writeSection( std::string const& className, + std::string const& rootName, + SectionNode const& sectionNode, + bool testOkToFail ); void writeAssertions(SectionNode const& sectionNode); void writeAssertion(AssertionStats const& stats); @@ -6394,6 +6430,12 @@ namespace Catch { #endif #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) +// start catch_benchmarking_all.hpp + +// A proxy header that includes all of the benchmarking headers to allow +// concise include of the benchmarking features. You should prefer the +// individual includes in standard use. + // start catch_benchmark.hpp // Benchmark @@ -6529,20 +6571,18 @@ namespace Catch { return {}; } }; - template - using ResultOf_t = typename std::result_of::type; // invoke and not return void :( template - CompleteType_t> complete_invoke(Fun&& fun, Args&&... args) { - return CompleteInvoker>::invoke(std::forward(fun), std::forward(args)...); + CompleteType_t> complete_invoke(Fun&& fun, Args&&... args) { + return CompleteInvoker>::invoke(std::forward(fun), std::forward(args)...); } const std::string benchmarkErrorMsg = "a benchmark failed to run successfully"; } // namespace Detail template - Detail::CompleteType_t> user_code(Fun&& fun) { + Detail::CompleteType_t> user_code(Fun&& fun) { CATCH_TRY{ return Detail::complete_invoke(std::forward(fun)); } CATCH_CATCH_ALL{ @@ -6787,8 +6827,8 @@ namespace Catch { Result result; int iterations; }; - template - using TimingOf = Timing, Detail::CompleteType_t>>; + template + using TimingOf = Timing, Detail::CompleteType_t>>; } // namespace Benchmark } // namespace Catch @@ -6799,7 +6839,7 @@ namespace Catch { namespace Benchmark { namespace Detail { template - TimingOf measure(Fun&& fun, Args&&... args) { + TimingOf measure(Fun&& fun, Args&&... 
args) { auto start = Clock::now(); auto&& r = Detail::complete_invoke(fun, std::forward(args)...); auto end = Clock::now(); @@ -6818,11 +6858,11 @@ namespace Catch { namespace Benchmark { namespace Detail { template - TimingOf measure_one(Fun&& fun, int iters, std::false_type) { + TimingOf measure_one(Fun&& fun, int iters, std::false_type) { return Detail::measure(fun, iters); } template - TimingOf measure_one(Fun&& fun, int iters, std::true_type) { + TimingOf measure_one(Fun&& fun, int iters, std::true_type) { Detail::ChronometerModel meter; auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters)); @@ -6839,7 +6879,7 @@ namespace Catch { }; template - TimingOf)> run_for_at_least(ClockDuration how_long, int seed, Fun&& fun) { + TimingOf> run_for_at_least(ClockDuration how_long, int seed, Fun&& fun) { auto iters = seed; while (iters < (1 << 30)) { auto&& Timing = measure_one(fun, iters, is_callable()); @@ -6849,7 +6889,7 @@ namespace Catch { } iters *= 2; } - throw optimized_away_error{}; + Catch::throw_exception(optimized_away_error{}); } } // namespace Detail } // namespace Benchmark @@ -6857,6 +6897,7 @@ namespace Catch { // end catch_run_for_at_least.hpp #include +#include namespace Catch { namespace Benchmark { @@ -6907,11 +6948,13 @@ namespace Catch { #include #include #include +#include #include #include #include #include #include +#include namespace Catch { namespace Benchmark { @@ -7025,8 +7068,8 @@ namespace Catch { double b2 = bias - z1; double a1 = a(b1); double a2 = a(b2); - auto lo = std::max(cumn(a1), 0); - auto hi = std::min(cumn(a2), n - 1); + auto lo = (std::max)(cumn(a1), 0); + auto hi = (std::min)(cumn(a2), n - 1); return { point, resample[lo], resample[hi], confidence_level }; } @@ -7095,7 +7138,9 @@ namespace Catch { } template EnvironmentEstimate> estimate_clock_cost(FloatDuration resolution) { - auto time_limit = std::min(resolution * clock_cost_estimation_tick_limit, FloatDuration(clock_cost_estimation_time_limit)); + auto time_limit = (std::min)( + resolution * clock_cost_estimation_tick_limit, + FloatDuration(clock_cost_estimation_time_limit)); auto time_clock = [](int k) { return Detail::measure([k] { for (int i = 0; i < k; ++i) { @@ -7261,10 +7306,10 @@ namespace Catch { template ExecutionPlan> prepare(const IConfig &cfg, Environment> env) const { auto min_time = env.clock_resolution.mean * Detail::minimum_ticks; - auto run_time = std::max(min_time, std::chrono::duration_cast(Detail::warmup_time)); + auto run_time = std::max(min_time, std::chrono::duration_cast(cfg.benchmarkWarmupTime())); auto&& test = Detail::run_for_at_least(std::chrono::duration_cast>(run_time), 1, fun); int new_iters = static_cast(std::ceil(min_time * test.iterations / test.elapsed)); - return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast>(Detail::warmup_time), Detail::warmup_iterations }; + return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations }; } template @@ -7296,7 +7341,7 @@ namespace Catch { }); auto analysis = Detail::analyse(*cfg, env, samples.begin(), samples.end()); - BenchmarkStats> stats{ info, analysis.samples, analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance }; + BenchmarkStats> stats{ info, analysis.samples, analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance }; 
getResultCapture().benchmarkEnded(stats); } CATCH_CATCH_ALL{ @@ -7337,6 +7382,77 @@ namespace Catch { BenchmarkName = [&] // end catch_benchmark.hpp +// start catch_constructor.hpp + +// Constructor and destructor helpers + + +#include + +namespace Catch { + namespace Benchmark { + namespace Detail { + template + struct ObjectStorage + { + using TStorage = typename std::aligned_storage::value>::type; + + ObjectStorage() : data() {} + + ObjectStorage(const ObjectStorage& other) + { + new(&data) T(other.stored_object()); + } + + ObjectStorage(ObjectStorage&& other) + { + new(&data) T(std::move(other.stored_object())); + } + + ~ObjectStorage() { destruct_on_exit(); } + + template + void construct(Args&&... args) + { + new (&data) T(std::forward(args)...); + } + + template + typename std::enable_if::type destruct() + { + stored_object().~T(); + } + + private: + // If this is a constructor benchmark, destruct the underlying object + template + void destruct_on_exit(typename std::enable_if::type* = 0) { destruct(); } + // Otherwise, don't + template + void destruct_on_exit(typename std::enable_if::type* = 0) { } + + T& stored_object() { + return *static_cast(static_cast(&data)); + } + + T const& stored_object() const { + return *static_cast(static_cast(&data)); + } + + TStorage data; + }; + } + + template + using storage_for = Detail::ObjectStorage; + + template + using destructable_object = Detail::ObjectStorage; + } +} + +// end catch_constructor.hpp +// end catch_benchmarking_all.hpp #endif #endif // ! CATCH_CONFIG_IMPL_ONLY @@ -7364,23 +7480,37 @@ namespace TestCaseTracking { SourceLineInfo location; NameAndLocation( std::string const& _name, SourceLineInfo const& _location ); + friend bool operator==(NameAndLocation const& lhs, NameAndLocation const& rhs) { + return lhs.name == rhs.name + && lhs.location == rhs.location; + } }; - struct ITracker; + class ITracker; using ITrackerPtr = std::shared_ptr; - struct ITracker { - virtual ~ITracker(); + class ITracker { + NameAndLocation m_nameAndLocation; + + public: + ITracker(NameAndLocation const& nameAndLoc) : + m_nameAndLocation(nameAndLoc) + {} // static queries - virtual NameAndLocation const& nameAndLocation() const = 0; + NameAndLocation const& nameAndLocation() const { + return m_nameAndLocation; + } + + virtual ~ITracker(); // dynamic queries virtual bool isComplete() const = 0; // Successfully completed or failed virtual bool isSuccessfullyCompleted() const = 0; virtual bool isOpen() const = 0; // Started but not complete virtual bool hasChildren() const = 0; + virtual bool hasStarted() const = 0; virtual ITracker& parent() = 0; @@ -7435,7 +7565,6 @@ namespace TestCaseTracking { }; using Children = std::vector; - NameAndLocation m_nameAndLocation; TrackerContext& m_ctx; ITracker* m_parent; Children m_children; @@ -7444,11 +7573,13 @@ namespace TestCaseTracking { public: TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent ); - NameAndLocation const& nameAndLocation() const override; bool isComplete() const override; bool isSuccessfullyCompleted() const override; bool isOpen() const override; bool hasChildren() const override; + bool hasStarted() const override { + return m_runState != NotStarted; + } void addChild( ITrackerPtr const& child ) override; @@ -7487,6 +7618,10 @@ namespace TestCaseTracking { void addInitialFilters( std::vector const& filters ); void addNextFilters( std::vector const& filters ); + //! Returns filters active in this tracker + std::vector const& getFilters() const; + //! 
Returns whitespace-trimmed name of the tracked section + std::string const& trimmedName() const; }; } // namespace TestCaseTracking @@ -7652,7 +7787,7 @@ namespace Catch { double sb = stddev.point; double mn = mean.point / n; double mg_min = mn / 2.; - double sg = std::min(mg_min / 4., sb / std::sqrt(n)); + double sg = (std::min)(mg_min / 4., sb / std::sqrt(n)); double sg2 = sg * sg; double sb2 = sb * sb; @@ -7671,13 +7806,14 @@ namespace Catch { return (nc / n) * (sb2 - nc * sg2); }; - return std::min(var_out(1), var_out(std::min(c_max(0.), c_max(mg_min)))) / sb2; + return (std::min)(var_out(1), var_out((std::min)(c_max(0.), c_max(mg_min)))) / sb2; } bootstrap_analysis analyse_samples(double confidence_level, int n_resamples, std::vector::iterator first, std::vector::iterator last) { + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS static std::random_device entropy; - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION auto n = static_cast(last - first); // seriously, one can't use integral types without hell in C++ @@ -7810,7 +7946,24 @@ namespace Catch { #ifdef CATCH_PLATFORM_MAC - #define CATCH_TRAP() __asm__("int $3\n" : : ) /* NOLINT */ + #if defined(__i386__) || defined(__x86_64__) + #define CATCH_TRAP() __asm__("int $3\n" : : ) /* NOLINT */ + #elif defined(__aarch64__) + #define CATCH_TRAP() __asm__(".inst 0xd4200000") + #endif + +#elif defined(CATCH_PLATFORM_IPHONE) + + // use inline assembler + #if defined(__i386__) || defined(__x86_64__) + #define CATCH_TRAP() __asm__("int $3") + #elif defined(__aarch64__) + #define CATCH_TRAP() __asm__(".inst 0xd4200000") + #elif defined(__arm__) && !defined(__thumb__) + #define CATCH_TRAP() __asm__(".inst 0xe7f001f0") + #elif defined(__arm__) && defined(__thumb__) + #define CATCH_TRAP() __asm__(".inst 0xde01") + #endif #elif defined(CATCH_PLATFORM_LINUX) // If we can use inline assembler, do it because this allows us to break @@ -7830,10 +7983,12 @@ namespace Catch { #define CATCH_TRAP() DebugBreak() #endif -#ifdef CATCH_TRAP - #define CATCH_BREAK_INTO_DEBUGGER() []{ if( Catch::isDebuggerActive() ) { CATCH_TRAP(); } }() -#else - #define CATCH_BREAK_INTO_DEBUGGER() []{}() +#ifndef CATCH_BREAK_INTO_DEBUGGER + #ifdef CATCH_TRAP + #define CATCH_BREAK_INTO_DEBUGGER() []{ if( Catch::isDebuggerActive() ) { CATCH_TRAP(); } }() + #else + #define CATCH_BREAK_INTO_DEBUGGER() []{}() + #endif #endif // end catch_debugger.h @@ -7841,86 +7996,58 @@ namespace Catch { // start catch_fatal_condition.h -// start catch_windows_h_proxy.h - - -#if defined(CATCH_PLATFORM_WINDOWS) - -#if !defined(NOMINMAX) && !defined(CATCH_CONFIG_NO_NOMINMAX) -# define CATCH_DEFINED_NOMINMAX -# define NOMINMAX -#endif -#if !defined(WIN32_LEAN_AND_MEAN) && !defined(CATCH_CONFIG_NO_WIN32_LEAN_AND_MEAN) -# define CATCH_DEFINED_WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -#endif - -#ifdef __AFXDLL -#include -#else -#include -#endif - -#ifdef CATCH_DEFINED_NOMINMAX -# undef NOMINMAX -#endif -#ifdef CATCH_DEFINED_WIN32_LEAN_AND_MEAN -# undef WIN32_LEAN_AND_MEAN -#endif - -#endif // defined(CATCH_PLATFORM_WINDOWS) - -// end catch_windows_h_proxy.h -#if defined( CATCH_CONFIG_WINDOWS_SEH ) +#include namespace Catch { - struct FatalConditionHandler { - - static LONG CALLBACK handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo); + // Wrapper for platform-specific fatal error (signals/SEH) handlers + // + // Tries to be cooperative with other handlers, and not step over + // other handlers. 
This means that unknown structured exceptions + // are passed on, previous signal handlers are called, and so on. + // + // Can only be instantiated once, and assumes that once a signal + // is caught, the binary will end up terminating. Thus, there + class FatalConditionHandler { + bool m_started = false; + + // Install/disengage implementation for specific platform. + // Should be if-defed to work on current platform, can assume + // engage-disengage 1:1 pairing. + void engage_platform(); + void disengage_platform(); + public: + // Should also have platform-specific implementations as needed FatalConditionHandler(); - static void reset(); ~FatalConditionHandler(); - private: - static bool isSet; - static ULONG guaranteeSize; - static PVOID exceptionHandlerHandle; - }; - -} // namespace Catch - -#elif defined ( CATCH_CONFIG_POSIX_SIGNALS ) - -#include - -namespace Catch { - - struct FatalConditionHandler { - - static bool isSet; - static struct sigaction oldSigActions[]; - static stack_t oldSigStack; - static char altStackMem[]; - - static void handleSignal( int sig ); + void engage() { + assert(!m_started && "Handler cannot be installed twice."); + m_started = true; + engage_platform(); + } - FatalConditionHandler(); - ~FatalConditionHandler(); - static void reset(); + void disengage() { + assert(m_started && "Handler cannot be uninstalled without being installed first"); + m_started = false; + disengage_platform(); + } }; -} // namespace Catch - -#else - -namespace Catch { - struct FatalConditionHandler { - void reset(); + //! Simple RAII guard for (dis)engaging the FatalConditionHandler + class FatalConditionHandlerGuard { + FatalConditionHandler* m_handler; + public: + FatalConditionHandlerGuard(FatalConditionHandler* handler): + m_handler(handler) { + m_handler->engage(); + } + ~FatalConditionHandlerGuard() { + m_handler->disengage(); + } }; -} -#endif +} // end namespace Catch // end catch_fatal_condition.h #include @@ -7980,7 +8107,7 @@ namespace Catch { void sectionEnded( SectionEndInfo const& endInfo ) override; void sectionEndedEarly( SectionEndInfo const& endInfo ) override; - auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& override; + auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& override; #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) void benchmarkPreparing( std::string const& name ) override; @@ -8046,6 +8173,7 @@ namespace Catch { std::vector m_unfinishedSections; std::vector m_activeSections; TrackerContext m_trackerContext; + FatalConditionHandler m_fatalConditionhandler; bool m_lastAssertionPassed = false; bool m_shouldReportUnexpected = true; bool m_includeSuccessfulResults; @@ -8956,7 +9084,7 @@ namespace detail { } inline auto convertInto( std::string const &source, bool &target ) -> ParserResult { std::string srcLC = source; - std::transform( srcLC.begin(), srcLC.end(), srcLC.begin(), []( char c ) { return static_cast( std::tolower(c) ); } ); + std::transform( srcLC.begin(), srcLC.end(), srcLC.begin(), []( unsigned char c ) { return static_cast( std::tolower(c) ); } ); if (srcLC == "y" || srcLC == "1" || srcLC == "true" || srcLC == "yes" || srcLC == "on") target = true; else if (srcLC == "n" || srcLC == "0" || srcLC == "false" || srcLC == "no" || srcLC == "off") @@ -9605,8 +9733,7 @@ namespace Catch { if( !startsWith( line, '"' ) ) line = '"' + line + '"'; config.testsOrTags.push_back( line ); - config.testsOrTags.push_back( "," ); - + config.testsOrTags.emplace_back( 
"," ); } } //Remove comma in the end @@ -9647,14 +9774,16 @@ namespace Catch { }; auto const setWaitForKeypress = [&]( std::string const& keypress ) { auto keypressLc = toLower( keypress ); - if( keypressLc == "start" ) + if (keypressLc == "never") + config.waitForKeypress = WaitForKeypress::Never; + else if( keypressLc == "start" ) config.waitForKeypress = WaitForKeypress::BeforeStart; else if( keypressLc == "exit" ) config.waitForKeypress = WaitForKeypress::BeforeExit; else if( keypressLc == "both" ) config.waitForKeypress = WaitForKeypress::BeforeStartAndExit; else - return ParserResult::runtimeError( "keypress argument must be one of: start, exit or both. '" + keypress + "' not recognised" ); + return ParserResult::runtimeError( "keypress argument must be one of: never, start, exit or both. '" + keypress + "' not recognised" ); return ParserResult::ok( ParseResultType::Matched ); }; auto const setVerbosity = [&]( std::string const& verbosity ) { @@ -9724,6 +9853,9 @@ namespace Catch { | Opt( [&]( bool flag ) { config.showDurations = flag ? ShowDurations::Always : ShowDurations::Never; }, "yes|no" ) ["-d"]["--durations"] ( "show test durations" ) + | Opt( config.minDuration, "seconds" ) + ["-D"]["--min-duration"] + ( "show test durations for tests taking at least the given number of seconds" ) | Opt( loadTestNamesFromFile, "filename" ) ["-f"]["--input-file"] ( "load test names to run from a file" ) @@ -9754,7 +9886,7 @@ namespace Catch { | Opt( config.libIdentify ) ["--libidentify"] ( "report name and version according to libidentify standard" ) - | Opt( setWaitForKeypress, "start|exit|both" ) + | Opt( setWaitForKeypress, "never|start|exit|both" ) ["--wait-for-keypress"] ( "waits for a keypress before exiting" ) | Opt( config.benchmarkSamples, "samples" ) @@ -9769,7 +9901,10 @@ namespace Catch { | Opt( config.benchmarkNoAnalysis ) ["--benchmark-no-analysis"] ( "perform only measurements; do not perform any analysis" ) - | Arg( config.testsOrTags, "test name|pattern|tags" ) + | Opt( config.benchmarkWarmupTime, "benchmarkWarmupTime" ) + ["--benchmark-warmup-time"] + ( "amount of time in milliseconds spent on warming up each test (default: 100)" ) + | Arg( config.testsOrTags, "test name|pattern|tags" ) ( "which test or tests to use" ); return cli; @@ -9868,6 +10003,7 @@ namespace Catch { bool Config::warnAboutMissingAssertions() const { return !!(m_data.warnings & WarnAbout::NoAssertions); } bool Config::warnAboutNoTests() const { return !!(m_data.warnings & WarnAbout::NoTests); } ShowDurations::OrNot Config::showDurations() const { return m_data.showDurations; } + double Config::minDuration() const { return m_data.minDuration; } RunTests::InWhatOrder Config::runOrder() const { return m_data.runOrder; } unsigned int Config::rngSeed() const { return m_data.rngSeed; } UseColour::YesOrNo Config::useColour() const { return m_data.useColour; } @@ -9876,10 +10012,11 @@ namespace Catch { bool Config::showInvisibles() const { return m_data.showInvisibles; } Verbosity Config::verbosity() const { return m_data.verbosity; } - bool Config::benchmarkNoAnalysis() const { return m_data.benchmarkNoAnalysis; } - int Config::benchmarkSamples() const { return m_data.benchmarkSamples; } - double Config::benchmarkConfidenceInterval() const { return m_data.benchmarkConfidenceInterval; } - unsigned int Config::benchmarkResamples() const { return m_data.benchmarkResamples; } + bool Config::benchmarkNoAnalysis() const { return m_data.benchmarkNoAnalysis; } + int Config::benchmarkSamples() const { return 
m_data.benchmarkSamples; } + double Config::benchmarkConfidenceInterval() const { return m_data.benchmarkConfidenceInterval; } + unsigned int Config::benchmarkResamples() const { return m_data.benchmarkResamples; } + std::chrono::milliseconds Config::benchmarkWarmupTime() const { return std::chrono::milliseconds(m_data.benchmarkWarmupTime); } IStream const* Config::openStream() { return Catch::makeStream(m_data.outputFilename); @@ -9909,24 +10046,54 @@ namespace Catch { } // end catch_errno_guard.h -#include +// start catch_windows_h_proxy.h -namespace Catch { - namespace { - struct IColourImpl { - virtual ~IColourImpl() = default; - virtual void use( Colour::Code _colourCode ) = 0; - }; +#if defined(CATCH_PLATFORM_WINDOWS) - struct NoColourImpl : IColourImpl { - void use( Colour::Code ) {} +#if !defined(NOMINMAX) && !defined(CATCH_CONFIG_NO_NOMINMAX) +# define CATCH_DEFINED_NOMINMAX +# define NOMINMAX +#endif +#if !defined(WIN32_LEAN_AND_MEAN) && !defined(CATCH_CONFIG_NO_WIN32_LEAN_AND_MEAN) +# define CATCH_DEFINED_WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif - static IColourImpl* instance() { - static NoColourImpl s_instance; - return &s_instance; - } - }; +#ifdef __AFXDLL +#include +#else +#include +#endif + +#ifdef CATCH_DEFINED_NOMINMAX +# undef NOMINMAX +#endif +#ifdef CATCH_DEFINED_WIN32_LEAN_AND_MEAN +# undef WIN32_LEAN_AND_MEAN +#endif + +#endif // defined(CATCH_PLATFORM_WINDOWS) + +// end catch_windows_h_proxy.h +#include + +namespace Catch { + namespace { + + struct IColourImpl { + virtual ~IColourImpl() = default; + virtual void use( Colour::Code _colourCode ) = 0; + }; + + struct NoColourImpl : IColourImpl { + void use( Colour::Code ) override {} + + static IColourImpl* instance() { + static NoColourImpl s_instance; + return &s_instance; + } + }; } // anon namespace } // namespace Catch @@ -10052,7 +10219,7 @@ namespace { bool useColourOnPlatform() { return -#ifdef CATCH_PLATFORM_MAC +#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE) !isDebuggerActive() && #endif #if !(defined(__DJGPP__) && defined(__STRICT_ANSI__)) @@ -10093,13 +10260,13 @@ namespace Catch { namespace Catch { Colour::Colour( Code _colourCode ) { use( _colourCode ); } - Colour::Colour( Colour&& rhs ) noexcept { - m_moved = rhs.m_moved; - rhs.m_moved = true; + Colour::Colour( Colour&& other ) noexcept { + m_moved = other.m_moved; + other.m_moved = true; } - Colour& Colour::operator=( Colour&& rhs ) noexcept { - m_moved = rhs.m_moved; - rhs.m_moved = true; + Colour& Colour::operator=( Colour&& other ) noexcept { + m_moved = other.m_moved; + other.m_moved = true; return *this; } @@ -10111,7 +10278,7 @@ namespace Catch { // However, under some conditions it does happen (see #1626), // and this change is small enough that we can let practicality // triumph over purity in this case. 
- if (impl != NULL) { + if (impl != nullptr) { impl->use( _colourCode ); } } @@ -10229,10 +10396,9 @@ namespace Catch { // end catch_debug_console.cpp // start catch_debugger.cpp -#ifdef CATCH_PLATFORM_MAC +#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE) -# include -# include +# include # include # include # include @@ -10426,7 +10592,7 @@ namespace Catch { // Extracts the actual name part of an enum instance // In other words, it returns the Blue part of Bikeshed::Colour::Blue StringRef extractInstanceName(StringRef enumInstance) { - // Find last occurence of ":" + // Find last occurrence of ":" size_t name_start = enumInstance.size(); while (name_start > 0 && enumInstance[name_start - 1] != ':') { --name_start; @@ -10464,7 +10630,7 @@ namespace Catch { assert( valueNames.size() == values.size() ); std::size_t i = 0; for( auto value : values ) - enumInfo->m_values.push_back({ value, valueNames[i++] }); + enumInfo->m_values.emplace_back(value, valueNames[i++]); return enumInfo; } @@ -10588,25 +10754,47 @@ namespace Catch { // end catch_exception_translator_registry.cpp // start catch_fatal_condition.cpp -#if defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wmissing-field-initializers" -#endif +#include + +#if !defined( CATCH_CONFIG_WINDOWS_SEH ) && !defined( CATCH_CONFIG_POSIX_SIGNALS ) + +namespace Catch { + + // If neither SEH nor signal handling is required, the handler impls + // do not have to do anything, and can be empty. + void FatalConditionHandler::engage_platform() {} + void FatalConditionHandler::disengage_platform() {} + FatalConditionHandler::FatalConditionHandler() = default; + FatalConditionHandler::~FatalConditionHandler() = default; + +} // end namespace Catch + +#endif // !CATCH_CONFIG_WINDOWS_SEH && !CATCH_CONFIG_POSIX_SIGNALS + +#if defined( CATCH_CONFIG_WINDOWS_SEH ) && defined( CATCH_CONFIG_POSIX_SIGNALS ) +#error "Inconsistent configuration: Windows' SEH handling and POSIX signals cannot be enabled at the same time" +#endif // CATCH_CONFIG_WINDOWS_SEH && CATCH_CONFIG_POSIX_SIGNALS #if defined( CATCH_CONFIG_WINDOWS_SEH ) || defined( CATCH_CONFIG_POSIX_SIGNALS ) namespace { - // Report the error condition + //! Signals fatal error message to the run context void reportFatal( char const * const message ) { Catch::getCurrentContext().getResultCapture()->handleFatalErrorCondition( message ); } -} -#endif // signals/SEH handling + //! Minimal size Catch2 needs for its own fatal error handling. + //! Picked anecdotally, so it might not be sufficient on all + //! platforms, and for all configurations. + constexpr std::size_t minStackSizeForErrors = 32 * 1024; +} // end unnamed namespace + +#endif // CATCH_CONFIG_WINDOWS_SEH || CATCH_CONFIG_POSIX_SIGNALS #if defined( CATCH_CONFIG_WINDOWS_SEH ) namespace Catch { + struct SignalDefs { DWORD id; const char* name; }; // There is no 1-1 mapping between signals and windows exceptions. 
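The rewritten fatal-condition handling above separates one-time resource setup (constructor/destructor) from installing and removing the actual signal/SEH handlers (engage_platform()/disengage_platform()), and RunContext now keeps a single long-lived FatalConditionHandler that is engaged through the RAII FatalConditionHandlerGuard only while a test body executes. A minimal sketch of that pattern, with hypothetical names (Handler, HandlerGuard, run_test_body) and the platform hooks assumed to be filled in elsewhere:

    #include <cassert>

    class Handler {
        bool m_started = false;
        void engage_platform()    { /* install signal/SEH handlers here */ }
        void disengage_platform() { /* restore the previous handlers here */ }
    public:
        void engage()    { assert(!m_started); m_started = true;  engage_platform(); }
        void disengage() { assert(m_started);  m_started = false; disengage_platform(); }
    };

    class HandlerGuard {
        Handler* m_handler;
    public:
        explicit HandlerGuard(Handler* h) : m_handler(h) { m_handler->engage(); }
        ~HandlerGuard() { m_handler->disengage(); }
        HandlerGuard(HandlerGuard const&) = delete;
        HandlerGuard& operator=(HandlerGuard const&) = delete;
    };

    void run_test_body() {
        static Handler handler;       // one long-lived handler, as RunContext does
        HandlerGuard guard(&handler); // handlers are active only for this scope
        // ... invoke the active test case; the guard disengages on any exit path
    }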
@@ -10619,7 +10807,7 @@ namespace Catch { { static_cast(EXCEPTION_INT_DIVIDE_BY_ZERO), "Divide by zero error" }, }; - LONG CALLBACK FatalConditionHandler::handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo) { + static LONG CALLBACK handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo) { for (auto const& def : signalDefs) { if (ExceptionInfo->ExceptionRecord->ExceptionCode == def.id) { reportFatal(def.name); @@ -10630,38 +10818,50 @@ namespace Catch { return EXCEPTION_CONTINUE_SEARCH; } + // Since we do not support multiple instantiations, we put these + // into global variables and rely on cleaning them up in outlined + // constructors/destructors + static PVOID exceptionHandlerHandle = nullptr; + + // For MSVC, we reserve part of the stack memory for handling + // memory overflow structured exception. FatalConditionHandler::FatalConditionHandler() { - isSet = true; - // 32k seems enough for Catch to handle stack overflow, - // but the value was found experimentally, so there is no strong guarantee - guaranteeSize = 32 * 1024; - exceptionHandlerHandle = nullptr; + ULONG guaranteeSize = static_cast(minStackSizeForErrors); + if (!SetThreadStackGuarantee(&guaranteeSize)) { + // We do not want to fully error out, because needing + // the stack reserve should be rare enough anyway. + Catch::cerr() + << "Failed to reserve piece of stack." + << " Stack overflows will not be reported successfully."; + } + } + + // We do not attempt to unset the stack guarantee, because + // Windows does not support lowering the stack size guarantee. + FatalConditionHandler::~FatalConditionHandler() = default; + + void FatalConditionHandler::engage_platform() { // Register as first handler in current chain exceptionHandlerHandle = AddVectoredExceptionHandler(1, handleVectoredException); - // Pass in guarantee size to be filled - SetThreadStackGuarantee(&guaranteeSize); + if (!exceptionHandlerHandle) { + CATCH_RUNTIME_ERROR("Could not register vectored exception handler"); + } } - void FatalConditionHandler::reset() { - if (isSet) { - RemoveVectoredExceptionHandler(exceptionHandlerHandle); - SetThreadStackGuarantee(&guaranteeSize); - exceptionHandlerHandle = nullptr; - isSet = false; + void FatalConditionHandler::disengage_platform() { + if (!RemoveVectoredExceptionHandler(exceptionHandlerHandle)) { + CATCH_RUNTIME_ERROR("Could not unregister vectored exception handler"); } + exceptionHandlerHandle = nullptr; } - FatalConditionHandler::~FatalConditionHandler() { - reset(); - } +} // end namespace Catch -bool FatalConditionHandler::isSet = false; -ULONG FatalConditionHandler::guaranteeSize = 0; -PVOID FatalConditionHandler::exceptionHandlerHandle = nullptr; +#endif // CATCH_CONFIG_WINDOWS_SEH -} // namespace Catch +#if defined( CATCH_CONFIG_POSIX_SIGNALS ) -#elif defined( CATCH_CONFIG_POSIX_SIGNALS ) +#include namespace Catch { @@ -10670,10 +10870,6 @@ namespace Catch { const char* name; }; - // 32kb for the alternate stack seems to be sufficient. However, this value - // is experimentally determined, so that's not guaranteed. - static constexpr std::size_t sigStackSize = 32768 >= MINSIGSTKSZ ? 
32768 : MINSIGSTKSZ; - static SignalDefs signalDefs[] = { { SIGINT, "SIGINT - Terminal interrupt signal" }, { SIGILL, "SIGILL - Illegal instruction signal" }, @@ -10683,7 +10879,32 @@ namespace Catch { { SIGABRT, "SIGABRT - Abort (abnormal termination) signal" } }; - void FatalConditionHandler::handleSignal( int sig ) { +// Older GCCs trigger -Wmissing-field-initializers for T foo = {} +// which is zero initialization, but not explicit. We want to avoid +// that. +#if defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#endif + + static char* altStackMem = nullptr; + static std::size_t altStackSize = 0; + static stack_t oldSigStack{}; + static struct sigaction oldSigActions[sizeof(signalDefs) / sizeof(SignalDefs)]{}; + + static void restorePreviousSignalHandlers() { + // We set signal handlers back to the previous ones. Hopefully + // nobody overwrote them in the meantime, and doesn't expect + // their signal handlers to live past ours given that they + // installed them after ours.. + for (std::size_t i = 0; i < sizeof(signalDefs) / sizeof(SignalDefs); ++i) { + sigaction(signalDefs[i].id, &oldSigActions[i], nullptr); + } + // Return the old stack + sigaltstack(&oldSigStack, nullptr); + } + + static void handleSignal( int sig ) { char const * name = ""; for (auto const& def : signalDefs) { if (sig == def.id) { @@ -10691,16 +10912,33 @@ namespace Catch { break; } } - reset(); - reportFatal(name); + // We need to restore previous signal handlers and let them do + // their thing, so that the users can have the debugger break + // when a signal is raised, and so on. + restorePreviousSignalHandlers(); + reportFatal( name ); raise( sig ); } FatalConditionHandler::FatalConditionHandler() { - isSet = true; + assert(!altStackMem && "Cannot initialize POSIX signal handler when one already exists"); + if (altStackSize == 0) { + altStackSize = std::max(static_cast(SIGSTKSZ), minStackSizeForErrors); + } + altStackMem = new char[altStackSize](); + } + + FatalConditionHandler::~FatalConditionHandler() { + delete[] altStackMem; + // We signal that another instance can be constructed by zeroing + // out the pointer. 
+ altStackMem = nullptr; + } + + void FatalConditionHandler::engage_platform() { stack_t sigStack; sigStack.ss_sp = altStackMem; - sigStack.ss_size = sigStackSize; + sigStack.ss_size = altStackSize; sigStack.ss_flags = 0; sigaltstack(&sigStack, &oldSigStack); struct sigaction sa = { }; @@ -10712,40 +10950,17 @@ namespace Catch { } } - FatalConditionHandler::~FatalConditionHandler() { - reset(); - } +#if defined(__GNUC__) +# pragma GCC diagnostic pop +#endif - void FatalConditionHandler::reset() { - if( isSet ) { - // Set signals back to previous values -- hopefully nobody overwrote them in the meantime - for( std::size_t i = 0; i < sizeof(signalDefs)/sizeof(SignalDefs); ++i ) { - sigaction(signalDefs[i].id, &oldSigActions[i], nullptr); - } - // Return the old stack - sigaltstack(&oldSigStack, nullptr); - isSet = false; - } + void FatalConditionHandler::disengage_platform() { + restorePreviousSignalHandlers(); } - bool FatalConditionHandler::isSet = false; - struct sigaction FatalConditionHandler::oldSigActions[sizeof(signalDefs)/sizeof(SignalDefs)] = {}; - stack_t FatalConditionHandler::oldSigStack = {}; - char FatalConditionHandler::altStackMem[sigStackSize] = {}; - -} // namespace Catch - -#else - -namespace Catch { - void FatalConditionHandler::reset() {} -} - -#endif // signals/SEH handling +} // end namespace Catch -#if defined(__GNUC__) -# pragma GCC diagnostic pop -#endif +#endif // CATCH_CONFIG_POSIX_SIGNALS // end catch_fatal_condition.cpp // start catch_generators.cpp @@ -10764,8 +10979,8 @@ namespace Generators { GeneratorUntypedBase::~GeneratorUntypedBase() {} - auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { - return getResultCapture().acquireGeneratorTracker( lineInfo ); + auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { + return getResultCapture().acquireGeneratorTracker( generatorName, lineInfo ); } } // namespace Generators @@ -11040,7 +11255,7 @@ namespace Catch { namespace Catch { std::size_t listTests( Config const& config ) { - TestSpec testSpec = config.testSpec(); + TestSpec const& testSpec = config.testSpec(); if( config.hasTestFilters() ) Catch::cout() << "Matching test cases:\n"; else { @@ -11074,7 +11289,7 @@ namespace Catch { } std::size_t listTestsNamesOnly( Config const& config ) { - TestSpec testSpec = config.testSpec(); + TestSpec const& testSpec = config.testSpec(); std::size_t matchedTests = 0; std::vector matchedTestCases = filterTests( getAllTestCasesSorted( config ), testSpec, config ); for( auto const& testCaseInfo : matchedTestCases ) { @@ -11112,7 +11327,7 @@ namespace Catch { } std::size_t listTags( Config const& config ) { - TestSpec testSpec = config.testSpec(); + TestSpec const& testSpec = config.testSpec(); if( config.hasTestFilters() ) Catch::cout() << "Tags for matching test cases:\n"; else { @@ -11300,20 +11515,13 @@ namespace { return lhs == rhs; } - auto ulpDiff = std::abs(lc - rc); + // static cast as a workaround for IBM XLC + auto ulpDiff = std::abs(static_cast(lc - rc)); return static_cast(ulpDiff) <= maxUlpDiff; } -} //end anonymous namespace - #if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER) -#if defined(__clang__) -#pragma clang diagnostic push -// The long double overload is currently unused -#pragma clang diagnostic ignored "-Wunused-function" -#endif - float nextafter(float x, float y) { return ::nextafterf(x, y); } @@ -11322,18 +11530,8 @@ namespace { return ::nextafter(x, y); } - long double nextafter(long double x, long double y) { - 
return ::nextafterl(x, y); - } - -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - #endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^ -namespace { - template FP step(FP start, FP direction, uint64_t steps) { for (uint64_t i = 0; i < steps; ++i) { @@ -11431,9 +11629,10 @@ namespace Floating { ret << ", "; write(ret, step(m_target, static_cast( INFINITY), m_ulps)); } else { - write(ret, step(static_cast(m_target), -INFINITY, m_ulps)); + // We have to cast INFINITY to float because of MinGW, see #1782 + write(ret, step(static_cast(m_target), static_cast(-INFINITY), m_ulps)); ret << ", "; - write(ret, step(static_cast(m_target), INFINITY, m_ulps)); + write(ret, step(static_cast(m_target), static_cast( INFINITY), m_ulps)); } ret << "])"; @@ -11491,7 +11690,6 @@ Floating::WithinRelMatcher WithinRel(float target) { } // namespace Matchers } // namespace Catch - // end catch_matchers_floating.cpp // start catch_matchers_generic.cpp @@ -11669,10 +11867,10 @@ namespace Catch { Capturer::Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names ) { auto trimmed = [&] (size_t start, size_t end) { - while (names[start] == ',' || isspace(names[start])) { + while (names[start] == ',' || isspace(static_cast(names[start]))) { ++start; } - while (names[end] == ',' || isspace(names[end])) { + while (names[end] == ',' || isspace(static_cast(names[end]))) { --end; } return names.substr(start, end - start + 1); @@ -11711,7 +11909,7 @@ namespace Catch { pos = skipq(pos, c); break; case ',': - if (start != pos && openings.size() == 0) { + if (start != pos && openings.empty()) { m_messages.emplace_back(macroName, lineInfo, resultType); m_messages.back().message = static_cast(trimmed(start, pos)); m_messages.back().message += " := "; @@ -11719,7 +11917,7 @@ namespace Catch { } } } - assert(openings.size() == 0 && "Mismatched openings"); + assert(openings.empty() && "Mismatched openings"); m_messages.emplace_back(macroName, lineInfo, resultType); m_messages.back().message = static_cast(trimmed(start, names.size() - 1)); m_messages.back().message += " := "; @@ -11907,7 +12105,7 @@ namespace Catch { if (tmpnam_s(m_buffer)) { CATCH_RUNTIME_ERROR("Could not get a temp filename"); } - if (fopen_s(&m_file, m_buffer, "w")) { + if (fopen_s(&m_file, m_buffer, "w+")) { char buffer[100]; if (strerror_s(buffer, errno)) { CATCH_RUNTIME_ERROR("Could not translate errno to a string"); @@ -12202,11 +12400,13 @@ namespace Catch { namespace Catch { class StartupExceptionRegistry { +#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) public: void add(std::exception_ptr const& exception) noexcept; std::vector const& getExceptions() const noexcept; private: std::vector m_exceptions; +#endif }; } // end namespace Catch @@ -12289,7 +12489,11 @@ namespace Catch { m_tagAliasRegistry.add( alias, tag, lineInfo ); } void registerStartupException() noexcept override { +#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) m_exceptionRegistry.add(std::current_exception()); +#else + CATCH_INTERNAL_ERROR("Attempted to register active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!"); +#endif } IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() override { return m_enumValuesRegistry; @@ -12393,17 +12597,32 @@ namespace Catch { std::shared_ptr tracker; ITracker& currentTracker = ctx.currentTracker(); - if( TestCaseTracking::ITrackerPtr childTracker = currentTracker.findChild( nameAndLocation ) ) { + // Under specific circumstances, the generator we want + // to acquire is also the current 
tracker. If this is + // the case, we have to avoid looking through current + // tracker's children, and instead return the current + // tracker. + // A case where this check is important is e.g. + // for (int i = 0; i < 5; ++i) { + // int n = GENERATE(1, 2); + // } + // + // without it, the code above creates 5 nested generators. + if (currentTracker.nameAndLocation() == nameAndLocation) { + auto thisTracker = currentTracker.parent().findChild(nameAndLocation); + assert(thisTracker); + assert(thisTracker->isGeneratorTracker()); + tracker = std::static_pointer_cast(thisTracker); + } else if ( TestCaseTracking::ITrackerPtr childTracker = currentTracker.findChild( nameAndLocation ) ) { assert( childTracker ); assert( childTracker->isGeneratorTracker() ); tracker = std::static_pointer_cast( childTracker ); - } - else { + } else { tracker = std::make_shared( nameAndLocation, ctx, ¤tTracker ); currentTracker.addChild( tracker ); } - if( !ctx.completedCycle() && !tracker->isComplete() ) { + if( !tracker->isComplete() ) { tracker->open(); } @@ -12417,8 +12636,68 @@ namespace Catch { } void close() override { TrackerBase::close(); - // Generator interface only finds out if it has another item on atual move - if (m_runState == CompletedSuccessfully && m_generator->next()) { + // If a generator has a child (it is followed by a section) + // and none of its children have started, then we must wait + // until later to start consuming its values. + // This catches cases where `GENERATE` is placed between two + // `SECTION`s. + // **The check for m_children.empty cannot be removed**. + // doing so would break `GENERATE` _not_ followed by `SECTION`s. + const bool should_wait_for_child = [&]() { + // No children -> nobody to wait for + if ( m_children.empty() ) { + return false; + } + // If at least one child started executing, don't wait + if ( std::find_if( + m_children.begin(), + m_children.end(), + []( TestCaseTracking::ITrackerPtr tracker ) { + return tracker->hasStarted(); + } ) != m_children.end() ) { + return false; + } + + // No children have started. We need to check if they _can_ + // start, and thus we should wait for them, or they cannot + // start (due to filters), and we shouldn't wait for them + auto* parent = m_parent; + // This is safe: there is always at least one section + // tracker in a test case tracking tree + while ( !parent->isSectionTracker() ) { + parent = &( parent->parent() ); + } + assert( parent && + "Missing root (test case) level section" ); + + auto const& parentSection = + static_cast( *parent ); + auto const& filters = parentSection.getFilters(); + // No filters -> no restrictions on running sections + if ( filters.empty() ) { + return true; + } + + for ( auto const& child : m_children ) { + if ( child->isSectionTracker() && + std::find( filters.begin(), + filters.end(), + static_cast( *child ) + .trimmedName() ) != + filters.end() ) { + return true; + } + } + return false; + }(); + + // This check is a bit tricky, because m_generator->next() + // has a side-effect, where it consumes generator's current + // value, but we do not want to invoke the side-effect if + // this generator is still waiting for any child to start. 
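The generator-tracker changes above and below make a GENERATE that sits between two SECTIONs wait until one of the following sections can actually run before consuming the next value. A minimal sketch of the pattern being handled (test and section names are invented for the example):

    TEST_CASE("Generate between sections") {
        SECTION("first") {
            SUCCEED();
        }
        // The generator sits between two sections; with the change above, its
        // next value is only consumed once a following section is able to start.
        auto i = GENERATE(1, 2);
        SECTION("second") {
            REQUIRE((i == 1 || i == 2));
        }
    }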
+ if ( should_wait_for_child || + ( m_runState == CompletedSuccessfully && + m_generator->next() ) ) { m_children.clear(); m_runState = Executing; } @@ -12554,10 +12833,10 @@ namespace Catch { return true; } - auto RunContext::acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { + auto RunContext::acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { using namespace Generators; - GeneratorTracker& tracker = GeneratorTracker::acquire( m_trackerContext, TestCaseTracking::NameAndLocation( "generator", lineInfo ) ); - assert( tracker.isOpen() ); + GeneratorTracker& tracker = GeneratorTracker::acquire(m_trackerContext, + TestCaseTracking::NameAndLocation( static_cast(generatorName), lineInfo ) ); m_lastAssertionInfo.lineInfo = lineInfo; return tracker; } @@ -12600,17 +12879,17 @@ namespace Catch { #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) void RunContext::benchmarkPreparing(std::string const& name) { - m_reporter->benchmarkPreparing(name); - } + m_reporter->benchmarkPreparing(name); + } void RunContext::benchmarkStarting( BenchmarkInfo const& info ) { m_reporter->benchmarkStarting( info ); } void RunContext::benchmarkEnded( BenchmarkStats<> const& stats ) { m_reporter->benchmarkEnded( stats ); } - void RunContext::benchmarkFailed(std::string const & error) { - m_reporter->benchmarkFailed(error); - } + void RunContext::benchmarkFailed(std::string const & error) { + m_reporter->benchmarkFailed(error); + } #endif // CATCH_CONFIG_ENABLE_BENCHMARKING void RunContext::pushScopedMessage(MessageInfo const & message) { @@ -12744,9 +13023,8 @@ namespace Catch { } void RunContext::invokeActiveTestCase() { - FatalConditionHandler fatalConditionHandler; // Handle signals + FatalConditionHandlerGuard _(&m_fatalConditionhandler); m_activeTestCase->invoke(); - fatalConditionHandler.reset(); } void RunContext::handleUnfinishedSections() { @@ -13114,6 +13392,10 @@ namespace Catch { filename.erase(0, lastSlash); filename[0] = '#'; } + else + { + filename.insert(0, "#"); + } auto lastDot = filename.find_last_of('.'); if (lastDot != std::string::npos) { @@ -13207,11 +13489,11 @@ namespace Catch { char **utf8Argv = new char *[ argc ]; for ( int i = 0; i < argc; ++i ) { - int bufSize = WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, NULL, 0, NULL, NULL ); + int bufSize = WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, nullptr, 0, nullptr, nullptr ); utf8Argv[ i ] = new char[ bufSize ]; - WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, utf8Argv[i], bufSize, NULL, NULL ); + WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, utf8Argv[i], bufSize, nullptr, nullptr ); } int returnCode = applyCommandLine( argc, utf8Argv ); @@ -13331,6 +13613,7 @@ namespace Catch { // end catch_singletons.cpp // start catch_startup_exception_registry.cpp +#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) namespace Catch { void StartupExceptionRegistry::add( std::exception_ptr const& exception ) noexcept { CATCH_TRY { @@ -13346,6 +13629,7 @@ void StartupExceptionRegistry::add( std::exception_ptr const& exception ) noexce } } // end namespace Catch +#endif // end catch_startup_exception_registry.cpp // start catch_stream.cpp @@ -13530,7 +13814,7 @@ namespace Catch { namespace { char toLowerCh(char c) { - return static_cast( std::tolower( c ) ); + return static_cast( std::tolower( static_cast(c) ) ); } } @@ -13622,11 +13906,7 @@ namespace Catch { // end catch_string_manip.cpp // start catch_stringref.cpp -#if defined(__clang__) -# pragma clang diagnostic push -# pragma 
clang diagnostic ignored "-Wexit-time-destructors" -#endif - +#include #include #include #include @@ -13636,66 +13916,36 @@ namespace Catch { : StringRef( rawChars, static_cast(std::strlen(rawChars) ) ) {} - void StringRef::swap( StringRef& other ) noexcept { - std::swap( m_start, other.m_start ); - std::swap( m_size, other.m_size ); - std::swap( m_data, other.m_data ); - } - auto StringRef::c_str() const -> char const* { - if( !isSubstring() ) - return m_start; - - const_cast( this )->takeOwnership(); - return m_data; - } - auto StringRef::currentData() const noexcept -> char const* { + CATCH_ENFORCE(isNullTerminated(), "Called StringRef::c_str() on a non-null-terminated instance"); return m_start; } - - auto StringRef::isOwned() const noexcept -> bool { - return m_data != nullptr; - } - auto StringRef::isSubstring() const noexcept -> bool { - return m_start[m_size] != '\0'; + auto StringRef::data() const noexcept -> char const* { + return m_start; } - void StringRef::takeOwnership() { - if( !isOwned() ) { - m_data = new char[m_size+1]; - memcpy( m_data, m_start, m_size ); - m_data[m_size] = '\0'; - } - } auto StringRef::substr( size_type start, size_type size ) const noexcept -> StringRef { - if( start < m_size ) - return StringRef( m_start+start, size ); - else + if (start < m_size) { + return StringRef(m_start + start, (std::min)(m_size - start, size)); + } else { return StringRef(); + } } auto StringRef::operator == ( StringRef const& other ) const noexcept -> bool { - return - size() == other.size() && - (std::strncmp( m_start, other.m_start, size() ) == 0); - } - auto StringRef::operator != ( StringRef const& other ) const noexcept -> bool { - return !operator==( other ); + return m_size == other.m_size + && (std::memcmp( m_start, other.m_start, m_size ) == 0); } auto operator << ( std::ostream& os, StringRef const& str ) -> std::ostream& { - return os.write(str.currentData(), str.size()); + return os.write(str.data(), str.size()); } auto operator+=( std::string& lhs, StringRef const& rhs ) -> std::string& { - lhs.append(rhs.currentData(), rhs.size()); + lhs.append(rhs.data(), rhs.size()); return lhs; } } // namespace Catch - -#if defined(__clang__) -# pragma clang diagnostic pop -#endif // end catch_stringref.cpp // start catch_tag_alias.cpp @@ -13844,7 +14094,8 @@ namespace Catch { } } if( isHidden ) { - tags.push_back( "." ); + // Add all "hidden" tags to make them behave identically + tags.insert( tags.end(), { ".", "!hide" } ); } TestCaseInfo info( static_cast(nameAndTags.name), _className, desc, tags, _lineInfo ); @@ -13939,27 +14190,81 @@ namespace Catch { // end catch_test_case_info.cpp // start catch_test_case_registry_impl.cpp +#include #include namespace Catch { - std::vector sortTests( IConfig const& config, std::vector const& unsortedTestCases ) { + namespace { + struct TestHasher { + using hash_t = uint64_t; + + explicit TestHasher( hash_t hashSuffix ): + m_hashSuffix{ hashSuffix } {} + + uint32_t operator()( TestCase const& t ) const { + // FNV-1a hash with multiplication fold. 
+ const hash_t prime = 1099511628211u; + hash_t hash = 14695981039346656037u; + for ( const char c : t.name ) { + hash ^= c; + hash *= prime; + } + hash ^= m_hashSuffix; + hash *= prime; + const uint32_t low{ static_cast( hash ) }; + const uint32_t high{ static_cast( hash >> 32 ) }; + return low * high; + } - std::vector sorted = unsortedTestCases; + private: + hash_t m_hashSuffix; + }; + } // end unnamed namespace + std::vector sortTests( IConfig const& config, std::vector const& unsortedTestCases ) { switch( config.runOrder() ) { - case RunTests::InLexicographicalOrder: - std::sort( sorted.begin(), sorted.end() ); - break; - case RunTests::InRandomOrder: - seedRng( config ); - std::shuffle( sorted.begin(), sorted.end(), rng() ); - break; case RunTests::InDeclarationOrder: // already in declaration order break; + + case RunTests::InLexicographicalOrder: { + std::vector sorted = unsortedTestCases; + std::sort( sorted.begin(), sorted.end() ); + return sorted; + } + + case RunTests::InRandomOrder: { + seedRng( config ); + TestHasher h{ config.rngSeed() }; + + using hashedTest = std::pair; + std::vector indexed_tests; + indexed_tests.reserve( unsortedTestCases.size() ); + + for (auto const& testCase : unsortedTestCases) { + indexed_tests.emplace_back(h(testCase), &testCase); + } + + std::sort(indexed_tests.begin(), indexed_tests.end(), + [](hashedTest const& lhs, hashedTest const& rhs) { + if (lhs.first == rhs.first) { + return lhs.second->name < rhs.second->name; + } + return lhs.first < rhs.first; + }); + + std::vector sorted; + sorted.reserve( indexed_tests.size() ); + + for (auto const& hashed : indexed_tests) { + sorted.emplace_back(*hashed.second); + } + + return sorted; + } } - return sorted; + return unsortedTestCases; } bool isThrowSafe( TestCase const& testCase, IConfig const& config ) { @@ -14096,15 +14401,12 @@ namespace TestCaseTracking { m_currentTracker = tracker; } - TrackerBase::TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent ) - : m_nameAndLocation( nameAndLocation ), + TrackerBase::TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent ): + ITracker(nameAndLocation), m_ctx( ctx ), m_parent( parent ) {} - NameAndLocation const& TrackerBase::nameAndLocation() const { - return m_nameAndLocation; - } bool TrackerBase::isComplete() const { return m_runState == CompletedSuccessfully || m_runState == Failed; } @@ -14220,7 +14522,8 @@ namespace TestCaseTracking { bool SectionTracker::isComplete() const { bool complete = true; - if ((m_filters.empty() || m_filters[0] == "") + if (m_filters.empty() + || m_filters[0] == "" || std::find(m_filters.begin(), m_filters.end(), m_trimmed_name) != m_filters.end()) { complete = TrackerBase::isComplete(); } @@ -14255,8 +14558,8 @@ namespace TestCaseTracking { void SectionTracker::addInitialFilters( std::vector const& filters ) { if( !filters.empty() ) { m_filters.reserve( m_filters.size() + filters.size() + 2 ); - m_filters.push_back(""); // Root - should never be consulted - m_filters.push_back(""); // Test Case - not a section filter + m_filters.emplace_back(""); // Root - should never be consulted + m_filters.emplace_back(""); // Test Case - not a section filter m_filters.insert( m_filters.end(), filters.begin(), filters.end() ); } } @@ -14265,6 +14568,14 @@ namespace TestCaseTracking { m_filters.insert( m_filters.end(), filters.begin()+1, filters.end() ); } + std::vector const& SectionTracker::getFilters() const { + return m_filters; + } + + std::string const& 
SectionTracker::trimmedName() const { + return m_trimmed_name; + } + } // namespace TestCaseTracking using TestCaseTracking::ITracker; @@ -14498,9 +14809,9 @@ namespace Catch { switch( m_mode ) { case Name: case QuotedName: - return addPattern(); + return addNamePattern(); case Tag: - return addPattern(); + return addTagPattern(); case EscapedName: revertBackToLastMode(); return; @@ -14553,6 +14864,7 @@ namespace Catch { m_pos = m_arg.size(); m_substring.clear(); m_patternName.clear(); + m_realPatternPos = 0; return false; } endMode(); @@ -14560,6 +14872,63 @@ namespace Catch { return true; //success } + std::string TestSpecParser::preprocessPattern() { + std::string token = m_patternName; + for (std::size_t i = 0; i < m_escapeChars.size(); ++i) + token = token.substr(0, m_escapeChars[i] - i) + token.substr(m_escapeChars[i] - i + 1); + m_escapeChars.clear(); + if (startsWith(token, "exclude:")) { + m_exclusion = true; + token = token.substr(8); + } + + m_patternName.clear(); + m_realPatternPos = 0; + + return token; + } + + void TestSpecParser::addNamePattern() { + auto token = preprocessPattern(); + + if (!token.empty()) { + TestSpec::PatternPtr pattern = std::make_shared(token, m_substring); + if (m_exclusion) + pattern = std::make_shared(pattern); + m_currentFilter.m_patterns.push_back(pattern); + } + m_substring.clear(); + m_exclusion = false; + m_mode = None; + } + + void TestSpecParser::addTagPattern() { + auto token = preprocessPattern(); + + if (!token.empty()) { + // If the tag pattern is the "hide and tag" shorthand (e.g. [.foo]) + // we have to create a separate hide tag and shorten the real one + if (token.size() > 1 && token[0] == '.') { + token.erase(token.begin()); + TestSpec::PatternPtr pattern = std::make_shared(".", m_substring); + if (m_exclusion) { + pattern = std::make_shared(pattern); + } + m_currentFilter.m_patterns.push_back(pattern); + } + + TestSpec::PatternPtr pattern = std::make_shared(token, m_substring); + + if (m_exclusion) { + pattern = std::make_shared(pattern); + } + m_currentFilter.m_patterns.push_back(pattern); + } + m_substring.clear(); + m_exclusion = false; + m_mode = None; + } + TestSpec parseTestSpec( std::string const& arg ) { return TestSpecParser( ITagAliasRegistry::get() ).parse( arg ).testSpec(); } @@ -14661,13 +15030,11 @@ namespace Detail { enum Arch { Big, Little }; static Arch which() { - union _{ - int asInt; - char asChar[sizeof (int)]; - } u; - - u.asInt = 1; - return ( u.asChar[sizeof(int)-1] == 1 ) ? Big : Little; + int one = 1; + // If the lowest byte we read is non-zero, we can assume + // that little endian format is used. + auto value = *reinterpret_cast(&one); + return value ? Little : Big; } }; } @@ -14943,11 +15310,48 @@ namespace Catch { // end catch_totals.cpp // start catch_uncaught_exceptions.cpp +// start catch_config_uncaught_exceptions.hpp + +// Copyright Catch2 Authors +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// https://www.boost.org/LICENSE_1_0.txt) + +// SPDX-License-Identifier: BSL-1.0 + +#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP +#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP + +#if defined(_MSC_VER) +# if _MSC_VER >= 1900 // Visual Studio 2015 or newer +# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +# endif +#endif + +#include + +#if defined(__cpp_lib_uncaught_exceptions) \ + && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) + +# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +#endif // __cpp_lib_uncaught_exceptions + +#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \ + && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \ + && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) + +# define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +#endif + +#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP +// end catch_config_uncaught_exceptions.hpp #include namespace Catch { bool uncaught_exceptions() { -#if defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) +#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) + return false; +#elif defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) return std::uncaught_exceptions() > 0; #else return std::uncaught_exception(); @@ -14987,7 +15391,7 @@ namespace Catch { } Version const& libraryVersion() { - static Version version( 2, 10, 1, "", 0 ); + static Version version( 2, 13, 9, "", 0 ); return version; } @@ -15035,8 +15439,7 @@ namespace Catch { // start catch_xmlwriter.cpp #include - -using uchar = unsigned char; +#include namespace Catch { @@ -15076,8 +15479,30 @@ namespace { os.flags(f); } + bool shouldNewline(XmlFormatting fmt) { + return !!(static_cast::type>(fmt & XmlFormatting::Newline)); + } + + bool shouldIndent(XmlFormatting fmt) { + return !!(static_cast::type>(fmt & XmlFormatting::Indent)); + } + } // anonymous namespace + XmlFormatting operator | (XmlFormatting lhs, XmlFormatting rhs) { + return static_cast( + static_cast::type>(lhs) | + static_cast::type>(rhs) + ); + } + + XmlFormatting operator & (XmlFormatting lhs, XmlFormatting rhs) { + return static_cast( + static_cast::type>(lhs) & + static_cast::type>(rhs) + ); + } + XmlEncode::XmlEncode( std::string const& str, ForWhat forWhat ) : m_str( str ), m_forWhat( forWhat ) @@ -15088,7 +15513,7 @@ namespace { // (see: http://www.w3.org/TR/xml/#syntax) for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) { - uchar c = m_str[idx]; + unsigned char c = m_str[idx]; switch (c) { case '<': os << "<"; break; case '&': os << "&"; break; @@ -15148,7 +15573,7 @@ namespace { bool valid = true; uint32_t value = headerValue(c); for (std::size_t n = 1; n < encBytes; ++n) { - uchar nc = m_str[idx + n]; + unsigned char nc = m_str[idx + n]; valid &= ((nc & 0xC0) == 0x80); value = (value << 6) | (nc & 0x3F); } @@ -15182,13 +15607,17 @@ namespace { return os; } - XmlWriter::ScopedElement::ScopedElement( XmlWriter* writer ) - : m_writer( writer ) + XmlWriter::ScopedElement::ScopedElement( XmlWriter* writer, XmlFormatting fmt ) + : m_writer( writer ), + m_fmt(fmt) {} XmlWriter::ScopedElement::ScopedElement( ScopedElement&& other ) noexcept - : m_writer( other.m_writer ){ + : m_writer( other.m_writer ), + m_fmt(other.m_fmt) + { other.m_writer = nullptr; + other.m_fmt = XmlFormatting::None; } XmlWriter::ScopedElement& XmlWriter::ScopedElement::operator=( ScopedElement&& other ) noexcept { if ( m_writer ) { @@ -15196,16 +15625,19 @@ namespace { } m_writer = other.m_writer; other.m_writer = nullptr; + m_fmt = other.m_fmt; + other.m_fmt = 
XmlFormatting::None; return *this; } XmlWriter::ScopedElement::~ScopedElement() { - if( m_writer ) - m_writer->endElement(); + if (m_writer) { + m_writer->endElement(m_fmt); + } } - XmlWriter::ScopedElement& XmlWriter::ScopedElement::writeText( std::string const& text, bool indent ) { - m_writer->writeText( text, indent ); + XmlWriter::ScopedElement& XmlWriter::ScopedElement::writeText( std::string const& text, XmlFormatting fmt ) { + m_writer->writeText( text, fmt ); return *this; } @@ -15215,37 +15647,47 @@ namespace { } XmlWriter::~XmlWriter() { - while( !m_tags.empty() ) + while (!m_tags.empty()) { endElement(); + } + newlineIfNecessary(); } - XmlWriter& XmlWriter::startElement( std::string const& name ) { + XmlWriter& XmlWriter::startElement( std::string const& name, XmlFormatting fmt ) { ensureTagClosed(); newlineIfNecessary(); - m_os << m_indent << '<' << name; + if (shouldIndent(fmt)) { + m_os << m_indent; + m_indent += " "; + } + m_os << '<' << name; m_tags.push_back( name ); - m_indent += " "; m_tagIsOpen = true; + applyFormatting(fmt); return *this; } - XmlWriter::ScopedElement XmlWriter::scopedElement( std::string const& name ) { - ScopedElement scoped( this ); - startElement( name ); + XmlWriter::ScopedElement XmlWriter::scopedElement( std::string const& name, XmlFormatting fmt ) { + ScopedElement scoped( this, fmt ); + startElement( name, fmt ); return scoped; } - XmlWriter& XmlWriter::endElement() { - newlineIfNecessary(); - m_indent = m_indent.substr( 0, m_indent.size()-2 ); + XmlWriter& XmlWriter::endElement(XmlFormatting fmt) { + m_indent = m_indent.substr(0, m_indent.size() - 2); + if( m_tagIsOpen ) { m_os << "/>"; m_tagIsOpen = false; + } else { + newlineIfNecessary(); + if (shouldIndent(fmt)) { + m_os << m_indent; + } + m_os << ""; } - else { - m_os << m_indent << ""; - } - m_os << std::endl; + m_os << std::flush; + applyFormatting(fmt); m_tags.pop_back(); return *this; } @@ -15261,22 +15703,26 @@ namespace { return *this; } - XmlWriter& XmlWriter::writeText( std::string const& text, bool indent ) { + XmlWriter& XmlWriter::writeText( std::string const& text, XmlFormatting fmt) { if( !text.empty() ){ bool tagWasOpen = m_tagIsOpen; ensureTagClosed(); - if( tagWasOpen && indent ) + if (tagWasOpen && shouldIndent(fmt)) { m_os << m_indent; + } m_os << XmlEncode( text ); - m_needsNewline = true; + applyFormatting(fmt); } return *this; } - XmlWriter& XmlWriter::writeComment( std::string const& text ) { + XmlWriter& XmlWriter::writeComment( std::string const& text, XmlFormatting fmt) { ensureTagClosed(); - m_os << m_indent << ""; - m_needsNewline = true; + if (shouldIndent(fmt)) { + m_os << m_indent; + } + m_os << ""; + applyFormatting(fmt); return *this; } @@ -15292,11 +15738,16 @@ namespace { void XmlWriter::ensureTagClosed() { if( m_tagIsOpen ) { - m_os << ">" << std::endl; + m_os << '>' << std::flush; + newlineIfNecessary(); m_tagIsOpen = false; } } + void XmlWriter::applyFormatting(XmlFormatting fmt) { + m_needsNewline = shouldNewline(fmt); + } + void XmlWriter::writeDeclaration() { m_os << "\n"; } @@ -15342,6 +15793,17 @@ namespace Catch { return std::string(buffer); } + bool shouldShowDuration( IConfig const& config, double duration ) { + if ( config.showDurations() == ShowDurations::Always ) { + return true; + } + if ( config.showDurations() == ShowDurations::Never ) { + return false; + } + const double min = config.minDuration(); + return min >= 0 && duration >= min; + } + std::string serializeFilters( std::vector const& container ) { ReusableStringStream oss; bool 
first = true; @@ -15608,10 +16070,6 @@ class AssertionPrinter { return "Reports test results on a single line, suitable for IDEs"; } - ReporterPreferences CompactReporter::getPreferences() const { - return m_reporterPrefs; - } - void CompactReporter::noMatchingTestCases( std::string const& spec ) { stream << "No test cases matched '" << spec << '\'' << std::endl; } @@ -15638,8 +16096,9 @@ class AssertionPrinter { } void CompactReporter::sectionEnded(SectionStats const& _sectionStats) { - if (m_config->showDurations() == ShowDurations::Always) { - stream << getFormattedDuration(_sectionStats.durationInSeconds) << " s: " << _sectionStats.sectionInfo.name << std::endl; + double dur = _sectionStats.durationInSeconds; + if ( shouldShowDuration( *m_config, dur ) ) { + stream << getFormattedDuration( dur ) << " s: " << _sectionStats.sectionInfo.name << std::endl; } } @@ -15851,15 +16310,11 @@ class Duration { static const uint64_t s_nanosecondsInASecond = 1000 * s_nanosecondsInAMillisecond; static const uint64_t s_nanosecondsInAMinute = 60 * s_nanosecondsInASecond; - uint64_t m_inNanoseconds; + double m_inNanoseconds; Unit m_units; public: - explicit Duration(double inNanoseconds, Unit units = Unit::Auto) - : Duration(static_cast(inNanoseconds), units) { - } - - explicit Duration(uint64_t inNanoseconds, Unit units = Unit::Auto) + explicit Duration(double inNanoseconds, Unit units = Unit::Auto) : m_inNanoseconds(inNanoseconds), m_units(units) { if (m_units == Unit::Auto) { @@ -15888,7 +16343,7 @@ class Duration { case Unit::Minutes: return m_inNanoseconds / static_cast(s_nanosecondsInAMinute); default: - return static_cast(m_inNanoseconds); + return m_inNanoseconds; } } auto unitsAsString() const -> std::string { @@ -16007,7 +16462,7 @@ ConsoleReporter::ConsoleReporter(ReporterConfig const& config) else { return{ - { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 32, ColumnInfo::Left }, + { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, ColumnInfo::Left }, { "samples mean std dev", 14, ColumnInfo::Right }, { "iterations low mean low std dev", 14, ColumnInfo::Right }, { "estimated high mean high std dev", 14, ColumnInfo::Right } @@ -16063,8 +16518,9 @@ void ConsoleReporter::sectionEnded(SectionStats const& _sectionStats) { stream << "\nNo assertions in test case"; stream << " '" << _sectionStats.sectionInfo.name << "'\n" << std::endl; } - if (m_config->showDurations() == ShowDurations::Always) { - stream << getFormattedDuration(_sectionStats.durationInSeconds) << " s: " << _sectionStats.sectionInfo.name << std::endl; + double dur = _sectionStats.durationInSeconds; + if (shouldShowDuration(*m_config, dur)) { + stream << getFormattedDuration(dur) << " s: " << _sectionStats.sectionInfo.name << std::endl; } if (m_headerPrinted) { m_headerPrinted = false; @@ -16324,8 +16780,10 @@ void ConsoleReporter::printSummaryDivider() { } void ConsoleReporter::printTestFilters() { - if (m_config->testSpec().hasFilters()) - stream << Colour(Colour::BrightYellow) << "Filters: " << serializeFilters( m_config->getTestsOrTags() ) << '\n'; + if (m_config->testSpec().hasFilters()) { + Colour guard(Colour::BrightYellow); + stream << "Filters: " << serializeFilters(m_config->getTestsOrTags()) << '\n'; + } } CATCH_REGISTER_REPORTER("console", ConsoleReporter) @@ -16346,6 +16804,7 @@ CATCH_REGISTER_REPORTER("console", ConsoleReporter) #include #include #include +#include namespace Catch { @@ -16373,7 +16832,7 @@ namespace Catch { #else std::strftime(timeStamp, timeStampSize, fmt, timeInfo); #endif - return 
std::string(timeStamp); + return std::string(timeStamp, timeStampSize-1); } std::string fileNameTag(const std::vector &tags) { @@ -16384,6 +16843,17 @@ namespace Catch { return it->substr(1); return std::string(); } + + // Formats the duration in seconds to 3 decimal places. + // This is done because some genius defined Maven Surefire schema + // in a way that only accepts 3 decimal places, and tools like + // Jenkins use that schema for validation JUnit reporter output. + std::string formatDuration( double seconds ) { + ReusableStringStream rss; + rss << std::fixed << std::setprecision( 3 ) << seconds; + return rss.str(); + } + } // anonymous namespace JunitReporter::JunitReporter( ReporterConfig const& _config ) @@ -16453,7 +16923,7 @@ namespace Catch { if( m_config->showDurations() == ShowDurations::Never ) xml.writeAttribute( "time", "" ); else - xml.writeAttribute( "time", suiteTime ); + xml.writeAttribute( "time", formatDuration( suiteTime ) ); xml.writeAttribute( "timestamp", getCurrentTimestamp() ); // Write properties if there are any @@ -16475,8 +16945,8 @@ namespace Catch { for( auto const& child : groupNode.children ) writeTestCase( *child ); - xml.scopedElement( "system-out" ).writeText( trim( stdOutForSuite ), false ); - xml.scopedElement( "system-err" ).writeText( trim( stdErrForSuite ), false ); + xml.scopedElement( "system-out" ).writeText( trim( stdOutForSuite ), XmlFormatting::Newline ); + xml.scopedElement( "system-err" ).writeText( trim( stdErrForSuite ), XmlFormatting::Newline ); } void JunitReporter::writeTestCase( TestCaseNode const& testCaseNode ) { @@ -16498,12 +16968,13 @@ namespace Catch { if ( !m_config->name().empty() ) className = m_config->name() + "." + className; - writeSection( className, "", rootSection ); + writeSection( className, "", rootSection, stats.testInfo.okToFail() ); } - void JunitReporter::writeSection( std::string const& className, - std::string const& rootName, - SectionNode const& sectionNode ) { + void JunitReporter::writeSection( std::string const& className, + std::string const& rootName, + SectionNode const& sectionNode, + bool testOkToFail) { std::string name = trim( sectionNode.stats.sectionInfo.name ); if( !rootName.empty() ) name = rootName + '/' + name; @@ -16520,20 +16991,30 @@ namespace Catch { xml.writeAttribute( "classname", className ); xml.writeAttribute( "name", name ); } - xml.writeAttribute( "time", ::Catch::Detail::stringify( sectionNode.stats.durationInSeconds ) ); + xml.writeAttribute( "time", formatDuration( sectionNode.stats.durationInSeconds ) ); + // This is not ideal, but it should be enough to mimic gtest's + // junit output. + // Ideally the JUnit reporter would also handle `skipTest` + // events and write those out appropriately. 
+ xml.writeAttribute( "status", "run" ); + + if (sectionNode.stats.assertions.failedButOk) { + xml.scopedElement("skipped") + .writeAttribute("message", "TEST_CASE tagged with !mayfail"); + } writeAssertions( sectionNode ); if( !sectionNode.stdOut.empty() ) - xml.scopedElement( "system-out" ).writeText( trim( sectionNode.stdOut ), false ); + xml.scopedElement( "system-out" ).writeText( trim( sectionNode.stdOut ), XmlFormatting::Newline ); if( !sectionNode.stdErr.empty() ) - xml.scopedElement( "system-err" ).writeText( trim( sectionNode.stdErr ), false ); + xml.scopedElement( "system-err" ).writeText( trim( sectionNode.stdErr ), XmlFormatting::Newline ); } for( auto const& childNode : sectionNode.childSections ) if( className.empty() ) - writeSection( name, "", *childNode ); + writeSection( name, "", *childNode, testOkToFail ); else - writeSection( className, name, *childNode ); + writeSection( className, name, *childNode, testOkToFail ); } void JunitReporter::writeAssertions( SectionNode const& sectionNode ) { @@ -16551,11 +17032,7 @@ namespace Catch { elementName = "error"; break; case ResultWas::ExplicitFailure: - elementName = "failure"; - break; case ResultWas::ExpressionFailed: - elementName = "failure"; - break; case ResultWas::DidntThrowException: elementName = "failure"; break; @@ -16573,10 +17050,25 @@ namespace Catch { XmlWriter::ScopedElement e = xml.scopedElement( elementName ); - xml.writeAttribute( "message", result.getExpandedExpression() ); + xml.writeAttribute( "message", result.getExpression() ); xml.writeAttribute( "type", result.getTestMacroName() ); ReusableStringStream rss; + if (stats.totals.assertions.total() > 0) { + rss << "FAILED" << ":\n"; + if (result.hasExpression()) { + rss << " "; + rss << result.getExpressionInMacro(); + rss << '\n'; + } + if (result.hasExpandedExpression()) { + rss << "with expansion:\n"; + rss << Column(result.getExpandedExpression()).indent(2) << '\n'; + } + } else { + rss << '\n'; + } + if( !result.getMessage().empty() ) rss << result.getMessage() << '\n'; for( auto const& msg : stats.infoMessages ) @@ -16584,7 +17076,7 @@ namespace Catch { rss << msg.message << '\n'; rss << "at " << result.getSourceInfo(); - xml.writeText( rss.str(), false ); + xml.writeText( rss.str(), XmlFormatting::Newline ); } } @@ -16930,9 +17422,9 @@ namespace Catch { e.writeAttribute( "durationInSeconds", m_testCaseTimer.getElapsedSeconds() ); if( !testCaseStats.stdOut.empty() ) - m_xml.scopedElement( "StdOut" ).writeText( trim( testCaseStats.stdOut ), false ); + m_xml.scopedElement( "StdOut" ).writeText( trim( testCaseStats.stdOut ), XmlFormatting::Newline ); if( !testCaseStats.stdErr.empty() ) - m_xml.scopedElement( "StdErr" ).writeText( trim( testCaseStats.stdErr ), false ); + m_xml.scopedElement( "StdErr" ).writeText( trim( testCaseStats.stdErr ), XmlFormatting::Newline ); m_xml.endElement(); } @@ -16944,6 +17436,10 @@ namespace Catch { .writeAttribute( "successes", testGroupStats.totals.assertions.passed ) .writeAttribute( "failures", testGroupStats.totals.assertions.failed ) .writeAttribute( "expectedFailures", testGroupStats.totals.assertions.failedButOk ); + m_xml.scopedElement( "OverallResultsCases") + .writeAttribute( "successes", testGroupStats.totals.testCases.passed ) + .writeAttribute( "failures", testGroupStats.totals.testCases.failed ) + .writeAttribute( "expectedFailures", testGroupStats.totals.testCases.failedButOk ); m_xml.endElement(); } @@ -16953,6 +17449,10 @@ namespace Catch { .writeAttribute( "successes", 
testRunStats.totals.assertions.passed ) .writeAttribute( "failures", testRunStats.totals.assertions.failed ) .writeAttribute( "expectedFailures", testRunStats.totals.assertions.failedButOk ); + m_xml.scopedElement( "OverallResultsCases") + .writeAttribute( "successes", testRunStats.totals.testCases.passed ) + .writeAttribute( "failures", testRunStats.totals.testCases.failed ) + .writeAttribute( "expectedFailures", testRunStats.totals.testCases.failedButOk ); m_xml.endElement(); } @@ -16966,16 +17466,16 @@ namespace Catch { m_xml.writeAttribute("samples", info.samples) .writeAttribute("resamples", info.resamples) .writeAttribute("iterations", info.iterations) - .writeAttribute("clockResolution", static_cast(info.clockResolution)) - .writeAttribute("estimatedDuration", static_cast(info.estimatedDuration)) + .writeAttribute("clockResolution", info.clockResolution) + .writeAttribute("estimatedDuration", info.estimatedDuration) .writeComment("All values in nano seconds"); } void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) { m_xml.startElement("mean") - .writeAttribute("value", static_cast(benchmarkStats.mean.point.count())) - .writeAttribute("lowerBound", static_cast(benchmarkStats.mean.lower_bound.count())) - .writeAttribute("upperBound", static_cast(benchmarkStats.mean.upper_bound.count())) + .writeAttribute("value", benchmarkStats.mean.point.count()) + .writeAttribute("lowerBound", benchmarkStats.mean.lower_bound.count()) + .writeAttribute("upperBound", benchmarkStats.mean.upper_bound.count()) .writeAttribute("ci", benchmarkStats.mean.confidence_interval); m_xml.endElement(); m_xml.startElement("standardDeviation") @@ -17026,7 +17526,7 @@ namespace Catch { #ifndef __OBJC__ -#if defined(CATCH_CONFIG_WCHAR) && defined(WIN32) && defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN) +#if defined(CATCH_CONFIG_WCHAR) && defined(CATCH_PLATFORM_WINDOWS) && defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN) // Standard C/C++ Win32 Unicode wmain entry point extern "C" int wmain (int argc, wchar_t * argv[], wchar_t * []) { #else @@ -17159,9 +17659,9 @@ int main (int argc, char * const argv[]) { #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) #define CATCH_BENCHMARK(...) \ - INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) + INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) #define CATCH_BENCHMARK_ADVANCED(name) \ - INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name) + INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), name) #endif // CATCH_CONFIG_ENABLE_BENCHMARKING // If CATCH_CONFIG_PREFIX_ALL is not defined then the CATCH_ prefix is not required @@ -17263,9 +17763,9 @@ int main (int argc, char * const argv[]) { #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) #define BENCHMARK(...) 
\ - INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) + INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) #define BENCHMARK_ADVANCED(name) \ - INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name) + INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), name) #endif // CATCH_CONFIG_ENABLE_BENCHMARKING using Catch::Detail::Approx; @@ -17312,8 +17812,8 @@ using Catch::Detail::Approx; #define CATCH_WARN( msg ) (void)(0) #define CATCH_CAPTURE( msg ) (void)(0) -#define CATCH_TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) -#define CATCH_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define CATCH_TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) +#define CATCH_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #define CATCH_METHOD_AS_TEST_CASE( method, ... ) #define CATCH_REGISTER_TEST_CASE( Function, ... ) (void)(0) #define CATCH_SECTION( ... ) @@ -17322,7 +17822,7 @@ using Catch::Detail::Approx; #define CATCH_FAIL_CHECK( ... ) (void)(0) #define CATCH_SUCCEED( ... ) (void)(0) -#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define CATCH_TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__) @@ -17345,8 +17845,8 @@ using Catch::Detail::Approx; #endif // "BDD-style" convenience wrappers -#define CATCH_SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) -#define CATCH_SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), className ) +#define CATCH_SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) +#define CATCH_SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), className ) #define CATCH_GIVEN( desc ) #define CATCH_AND_GIVEN( desc ) #define CATCH_WHEN( desc ) @@ -17394,10 +17894,10 @@ using Catch::Detail::Approx; #define INFO( msg ) (void)(0) #define UNSCOPED_INFO( msg ) (void)(0) #define WARN( msg ) (void)(0) -#define CAPTURE( msg ) (void)(0) +#define CAPTURE( ... ) (void)(0) -#define TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) -#define TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) +#define TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #define METHOD_AS_TEST_CASE( method, ... 
) #define REGISTER_TEST_CASE( Function, ... ) (void)(0) #define SECTION( ... ) @@ -17405,7 +17905,7 @@ using Catch::Detail::Approx; #define FAIL( ... ) (void)(0) #define FAIL_CHECK( ... ) (void)(0) #define SUCCEED( ... ) (void)(0) -#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__) @@ -17435,8 +17935,8 @@ using Catch::Detail::Approx; #define CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION_NO_REG( INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionTranslator ), signature ) // "BDD-style" convenience wrappers -#define SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ) ) -#define SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), className ) +#define SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ) ) +#define SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), className ) #define GIVEN( desc ) #define AND_GIVEN( desc ) @@ -17467,4 +17967,3 @@ using Catch::Detail::Approx; // end catch_reenable_warnings.h // end catch.hpp #endif // TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED - From bf5eafab0c0b8fbe374a8755efd7f1448a87acb2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 2 Sep 2022 12:22:54 +0100 Subject: [PATCH 186/254] Bump src/3rd_party/intgemm from `a05a2e5` to `0eda93a` (#933) Bumps [src/3rd_party/intgemm](https://github.com/marian-nmt/intgemm) from `a05a2e5` to `0eda93a`. - [Release notes](https://github.com/marian-nmt/intgemm/releases) - [Commits](https://github.com/marian-nmt/intgemm/compare/a05a2e51ab524bcee954a39ee72005193f3adf7c...0eda93a95a4472af0a50c78b5df58e7fc459ac7a) --- updated-dependencies: - dependency-name: src/3rd_party/intgemm dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/3rd_party/intgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/intgemm b/src/3rd_party/intgemm index a05a2e51a..0eda93a95 160000 --- a/src/3rd_party/intgemm +++ b/src/3rd_party/intgemm @@ -1 +1 @@ -Subproject commit a05a2e51ab524bcee954a39ee72005193f3adf7c +Subproject commit 0eda93a95a4472af0a50c78b5df58e7fc459ac7a From 7d654603bf6abaa6a0e474f1bb7551cddfcb4c67 Mon Sep 17 00:00:00 2001 From: Jelmer Date: Fri, 2 Sep 2022 12:25:42 +0100 Subject: [PATCH 187/254] Fix guaranteed `YAML::InvalidNode` when compiled with `COMPILE_CPU=Off` (#944) --- src/common/config_validator.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index b0230da99..6c6b002aa 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -54,8 +54,10 @@ void ConfigValidator::validateOptionsTranslation() const { ABORT_IF(models.empty() && configs.empty(), "You need to provide at least one model file or a config file"); +#ifdef COMPILE_CPU ABORT_IF(get("model-mmap") && get("cpu-threads") == 0, "Model MMAP is CPU-only, please use --cpu-threads"); +#endif for(const auto& modelFile : models) { filesystem::Path modelPath(modelFile); From 0afe2478aac5605fe78155663543f3b4232f5269 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 2 Sep 2022 16:30:45 +0100 Subject: [PATCH 188/254] Upgrade workflows to ubuntu-20.04 and macos-12 (#962) * Upgrade workflows to ubuntu-20.04 and macos-12 * Update sentencepiece module * Install libunwind-dev on ubuntu-22.04 images * Update simple-websocket-server module --- .github/workflows/macos.yml | 6 ++-- .github/workflows/ubuntu.yml | 44 +++++++++++++++------------ src/3rd_party/sentencepiece | 2 +- src/3rd_party/simple-websocket-server | 2 +- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index c16213793..f06eed256 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -9,7 +9,7 @@ on: jobs: build-macos: name: MacOS CPU-only - runs-on: macos-10.15 + runs-on: macos-12 steps: - name: Checkout @@ -18,10 +18,12 @@ jobs: submodules: recursive - name: Install dependencies - run: brew install boost openssl protobuf + run: brew install boost openblas openssl protobuf - name: Configure CMake run: | + export LDFLAGS="-L/usr/local/opt/openblas/lib" + export CPPFLAGS="-I/usr/local/opt/openblas/include" mkdir -p build cd build cmake .. 
\ diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index a889df16c..bc01b74a8 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -13,56 +13,58 @@ jobs: include: # Ubuntu CPU-only build - name: "Ubuntu CPU-only" - os: ubuntu-18.04 + os: ubuntu-20.04 cuda: "" - gcc: 7 + gcc: 9 clang: "" cpu: true gpu: false unit_tests: true examples: false # Using Clang compiler - - name: "Ubuntu CPU-only clang-12" - os: ubuntu-20.04 + - name: "Ubuntu CPU-only clang-14" + os: ubuntu-22.04 cuda: "" gcc: "" - clang: 12 + clang: 14 cpu: true gpu: false unit_tests: true examples: false # Ubuntu GPU-only build - name: "Ubuntu GPU-only" - os: ubuntu-18.04 - cuda: "10.2" - gcc: 7 + os: ubuntu-20.04 + cuda: "11.1" + gcc: 9 clang: "" cpu: false gpu: true unit_tests: false examples: true - # Ubuntu 20.04 supports CUDA 11+ + # Ubuntu 22.04 supports CUDA 11.7 # Unit tests and examples are not compiled to save disk space - - name: "Ubuntu 20.04 CUDA 11.2 gcc-9" - os: ubuntu-20.04 - cuda: "11.2" - gcc: 9 + - name: "Ubuntu 22.04 CUDA 11.7 gcc-11" + os: ubuntu-22.04 + cuda: "11.7" + gcc: 11 clang: "" cpu: false gpu: true unit_tests: false examples: false - # Ubuntu 18.04 supports CUDA 10.1+ + # Ubuntu 20.04 supports CUDA 11+ # Unit tests and examples are not compiled to save disk space - - name: "Ubuntu 18.04 CUDA 10.2 gcc-8" - os: ubuntu-18.04 - cuda: "10.2" - gcc: 8 + - name: "Ubuntu 20.04 CUDA 11.1 gcc-9" + os: ubuntu-20.04 + cuda: "11.1" + gcc: 9 clang: "" cpu: true gpu: true unit_tests: false examples: false + # Ubuntu 18.04 supports CUDA 10.1+ + # But it will soon be removed from GitHub workflows # Ubuntu 16.04 supports CUDA 8+ # But it is no longer available in GitHub workflows @@ -78,10 +80,12 @@ jobs: # The following packages are already installed on GitHub-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev # Boost is no longer pre-installed on GitHub-hosted runners - # Clang 12.0 is pre-installed on the ubuntu-20.04 image + # Clang 12, 13 and 14 are pre-installed on the ubuntu-22.04 image + # Note that installation of libunwind-dev is a bug fix for ubuntu-22.04 images on Azure/GitHub-hosted machines + # and is normally not required - name: Install dependencies run: | - sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev + sudo apt-get install -y libunwind-dev libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev [ -z "${{ matrix.gcc }}" ] || sudo apt-get install -y gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 5312a306c..31ac8e887 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 5312a306c4c0a458e29a8882ebfb42a179aaf580 +Subproject commit 31ac8e88760f48d31843eeed36136458df0f60aa diff --git a/src/3rd_party/simple-websocket-server b/src/3rd_party/simple-websocket-server index 1d7e84aeb..8909c57b5 160000 --- a/src/3rd_party/simple-websocket-server +++ b/src/3rd_party/simple-websocket-server @@ -1 +1 @@ -Subproject commit 1d7e84aeb3f1ebdc78f6965d79ad3ca3003789fe +Subproject commit 8909c57b5473cb95e197fa7f034edabb474535ba From 347ab4d54a841041dab095d3439c5098dff5090f Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 5 Sep 
2022 16:51:48 +0100 Subject: [PATCH 189/254] Upgrade dependencies in the documentation framework (#965) * Upgrade lxml to 4.9.1; set docutils no higher than 0.17 * Remove mistune<2.0.3 due to potential vulnerability * Fix badges in docs * Update CHANGELOG --- CHANGELOG.md | 5 +++-- doc/.gitignore | 1 + doc/Makefile | 3 ++- doc/conf.py | 13 ------------- doc/index.rst | 22 +++++++++++----------- doc/requirements.txt | 4 ++-- 6 files changed, 19 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6614909e..439a0c3dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,17 +13,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Fixed - Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast) - Throw `ShapeSizeException` if total expanded shape size exceeds numeric capacity of the maximum int value (2^31-1) -- During mini-batch-fitting, catch `ShapeSizeException` and use another sizing hint. Aborts outside mini-batch-fitting. +- During mini-batch-fitting, catch `ShapeSizeException` and use another sizing hint. Aborts outside mini-batch-fitting. - Fix incorrect/missing gradient accumulation with delay > 1 or large effective batch size of biases of affine operations. - Fixed case augmentation with multi-threaded reading. - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load - Fixed check for `fortran_ordering` in cnpy - Fixed fp16 training/inference with factors-combine concat method - Fixed clang 13.0.1 compatibility +- Fixed potential vulnerabilities from lxml<4.9.1 or mistune<2.0.3 ### Changed - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce -- Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes. +- Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes. - Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling scale by 8 and do not try to automatically scale up or down. This seems most stable. - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce. - Changed minimal C++ standard to C++-17 diff --git a/doc/.gitignore b/doc/.gitignore index 4d192b770..47b72ab07 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -2,3 +2,4 @@ api build doxygen venv +CONTRIBUTING.md diff --git a/doc/Makefile b/doc/Makefile index 84310d9dc..aa2048b8c 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -14,10 +14,11 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile + cp $(SOURCEDIR)/../CONTRIBUTING.md $(SOURCEDIR)/ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) # Clean target as recommended by Exhale # https://exhale.readthedocs.io/en/latest/usage.html#optional-create-a-proper-clean-target clean: - rm -rf doxygen/ api/ + rm -rf doxygen/ api/ $(SOURCEDIR)/CONTRIBUTING.md @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/conf.py b/doc/conf.py index b0c68bcdf..192dd27dd 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -114,16 +114,3 @@ primary_domain = 'cpp' highlight_language = 'cpp' - -# A trick to include markdown files from outside the source directory using -# 'mdinclude'. 
Warning: all other markdown files not included via 'mdinclude' -# will be rendered using recommonmark as recommended by Sphinx -from m2r import MdInclude - -def setup(app): - # from m2r to make `mdinclude` work - app.add_config_value('no_underscore_emphasis', False, 'env') - app.add_config_value('m2r_parse_relative_links', False, 'env') - app.add_config_value('m2r_anonymous_references', False, 'env') - app.add_config_value('m2r_disable_inline_math', False, 'env') - app.add_directive('mdinclude', MdInclude) diff --git a/doc/index.rst b/doc/index.rst index d19bb4b00..9d769c32d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,7 +1,7 @@ Welcome to Marian's documentation! ================================== -|buildgpu| |buildcpu| |tests| |release| |license| +|ubuntu| |windows| |macos| |release| |license| Marian is an efficient and self-contained Neural Machine Translation framework with an integrated automatic differentiation engine based on dynamic computation graphs, written entirely in C++. @@ -19,7 +19,7 @@ This is developer documentation. User documentation is available at https://mari factors api/library_index - contributing + CONTRIBUTING doc_guide @@ -30,17 +30,17 @@ Indices and tables * :ref:`genindex` -.. |buildgpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.2.svg?label=CUDAC%20Build - :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.2/ - :alt: GPU build status +.. |ubuntu| image:: https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml/badge.svg + :target: https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml + :alt: Ubuntu build status -.. |buildcpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU%20Build - :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/ - :alt: CPU build status +.. |windows| image:: https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml/badge.svg + :target: https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml + :alt: Windows build status -.. |tests| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-regression-tests.svg?label=Tests - :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/ - :alt: Tests status +.. |macos| image:: https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml/badge.svg + :target: https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml + :alt: MacOS build status .. |release| image:: https://img.shields.io/github/release/marian-nmt/marian.svg?label=Release :target: https://github.com/marian-nmt/marian/releases diff --git a/doc/requirements.txt b/doc/requirements.txt index 40de5ddd9..a2f87dd91 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,9 +1,9 @@ +lxml>=4.9.1 +docutils<=0.17 sphinx==2.4.4 breathe==4.13.0 exhale sphinx_rtd_theme myst-parser==0.14.0a3 -mistune<2.0.0 -m2r sphinx-mathjax-offline Jinja2<3.1 From b6d066794e616cee6f2c22cced33987e5e3af245 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Sep 2022 16:52:16 +0100 Subject: [PATCH 190/254] Bump regression-tests from `4fa9ff5` to `92e116e` (#964) Bumps [regression-tests](https://github.com/marian-nmt/marian-regression-tests) from `4fa9ff5` to `92e116e`. 
- [Release notes](https://github.com/marian-nmt/marian-regression-tests/releases) - [Commits](https://github.com/marian-nmt/marian-regression-tests/compare/4fa9ff55af68bc87d8bd04c9b410f1e1d3874718...92e116efa369d6ed848c8eb19dfcef8bf7245d71) --- updated-dependencies: - dependency-name: regression-tests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 4fa9ff55a..92e116efa 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 4fa9ff55af68bc87d8bd04c9b410f1e1d3874718 +Subproject commit 92e116efa369d6ed848c8eb19dfcef8bf7245d71 From a5223e28ae47cd888dff84df1c8847ddcd9e5e13 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Sep 2022 16:52:36 +0100 Subject: [PATCH 191/254] Bump examples from `29f4f7c` to `25e8438` (#963) Bumps [examples](https://github.com/marian-nmt/marian-examples) from `29f4f7c` to `25e8438`. - [Release notes](https://github.com/marian-nmt/marian-examples/releases) - [Commits](https://github.com/marian-nmt/marian-examples/compare/29f4f7c380c860a95b9375813f4b199b2e6b5556...25e84383225a29f769e362250654ddf256d06261) --- updated-dependencies: - dependency-name: examples dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples b/examples index 29f4f7c38..25e843832 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 29f4f7c380c860a95b9375813f4b199b2e6b5556 +Subproject commit 25e84383225a29f769e362250654ddf256d06261 From 6b41df2a446ddd3c519993b1b116a8bb4926fc0f Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 5 Sep 2022 16:53:55 +0100 Subject: [PATCH 192/254] Version 1.11.8 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 77418c859..3037ff212 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.7 +v1.11.8 From a47912d9f1b8d26e2ab767decc018c0cb6e56006 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 15 Sep 2022 06:18:42 +0000 Subject: [PATCH 193/254] Merged PR 25518: Upgrade Azure Pipelines to macos-12 macos-10.15 will become unsupported in December 2022. 
Changes: * Upgrade Azure DevOps to macos-12 * Pull https://github.com/marian-nmt/sentencepiece/pull/14 * Fix clang 13 errors as in https://github.com/marian-nmt/marian-dev/pull/939 --- azure-pipelines.yml | 2 +- src/3rd_party/sentencepiece | 2 +- src/training/communicator.h | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a6fea5da6..0c7bd9c72 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -328,7 +328,7 @@ stages: displayName: macOS CPU clang pool: - vmImage: macos-10.15 + vmImage: macos-12 steps: - checkout: self diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 5312a306c..31ac8e887 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 5312a306c4c0a458e29a8882ebfb42a179aaf580 +Subproject commit 31ac8e88760f48d31843eeed36136458df0f60aa diff --git a/src/training/communicator.h b/src/training/communicator.h index c24caadcd..5ab1b6b27 100644 --- a/src/training/communicator.h +++ b/src/training/communicator.h @@ -130,7 +130,6 @@ class DefaultCommunicator : public ICommunicator { int totalSize = (int)graphs_[0]->params()->vals()->size(); int shardSize = (int)ceil(totalSize / (float)graphs_.size()); - int pos = 0; for(auto graph : graphs_) { int __size__ = std::min(shardSize, totalSize); @@ -145,7 +144,6 @@ class DefaultCommunicator : public ICommunicator { tmpTensors_.push_back(tmp); // move to next shard - pos += __size__; totalSize -= __size__; } } From 6f7766f8378693f8b8672b255c14de2d3c567328 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 15 Sep 2022 06:19:18 +0000 Subject: [PATCH 194/254] Merged PR 25465: Choose top checkpoints from train.log for averaging Added `--from-log logfile N metric asc|desc` option to `average.py`, which selects top N checkpoint paths from the provided train.log file according to the selected metric. Last 3 arguments to this option are optional. If the last argument is omitted, "asc" is assumed for perplexity and "desc" for other metrics. --- scripts/checkpoints/average.py | 168 +++++++++++++++++++++++++-------- 1 file changed, 128 insertions(+), 40 deletions(-) diff --git a/scripts/checkpoints/average.py b/scripts/checkpoints/average.py index da1ca2526..89a73e16a 100755 --- a/scripts/checkpoints/average.py +++ b/scripts/checkpoints/average.py @@ -1,55 +1,143 @@ #!/usr/bin/env python3 """ -This script takes multiple Marian *.npz model files and outputs an elementwise average of the model, -meant to do check-point averaging from: +This script takes multiple Marian *.npz model files and outputs an elementwise +average of the model, meant to do check-point averaging from: https://www.aclweb.org/anthology/W16-2316 -usage: +usage examples: ./average.py -m model.1.npz model.2.npz --output model.avg.npz +./average.py --from-log train.log 2 chrf --output model.avg.npz """ from __future__ import print_function +import argparse +import numpy as np import os +import re import sys -import argparse -import numpy as np -# Parse arguments -parser = argparse.ArgumentParser() -parser.add_argument('-m', '--model', nargs='+', required=True, - help="models to average") -parser.add_argument('-o', '--output', required=True, - help="output path") -args = parser.parse_args() - -# *average* holds the model matrix -average = dict() -# No. of models. 
-n = len(args.model) - -for filename in args.model: - print("Loading {}".format(filename)) - with open(filename, "rb") as mfile: - # Loads matrix from model file - m = np.load(mfile) - for k in m: - if k != "history_errs": - # Initialize the key - if k not in average: - average[k] = m[k] - # Add to the appropriate value - elif average[k].shape == m[k].shape and "special" not in k: - average[k] += m[k] - -# Actual averaging -for k in average: - if "special" not in k: - average[k] /= n - -# Save averaged model to file -print("Saving to {}".format(args.output)) -np.savez(args.output, **average) +def main(): + args = parse_cmd_args() + + if args.from_log: + models = find_best_models(*args.from_log) + else: + models = args.model + + print("Averaging models: {}".format(" ".join(models))) + average = average_models(models) + + # Save averaged model to file + print("Saving to {}".format(args.output)) + np.savez(args.output, **average) + + +def average_models(models): + average = dict() # Holds the model matrix + n = len(models) # No. of models. + + for filename in models: + print("Loading {}".format(filename)) + with open(filename, "rb") as mfile: + # Loads matrix from model file + m = np.load(mfile) + for k in m: + if k != "history_errs": + # Initialize the key + if k not in average: + average[k] = m[k] + # Add to the appropriate value + elif average[k].shape == m[k].shape and "special" not in k: + average[k] += m[k] + + # Actual averaging + for k in average: + if "special" not in k: + average[k] /= n + + return average + + +def find_best_models(logs, best=5, metric='chrf', order=None): + best = int(best) + if order is None: # Try to set ordering automatically + order = 'asc' if metric == 'perplexity' else 'desc' + print( + "Taking {} best checkpoints according to {}/{} from {}".format( + best, metric, order, logs + ) + ) + + match_model = re.compile( + r'Saving model weights and runtime parameters to (?P.*\.iter\d+\.npz)' + ) + match_valid = re.compile( + r'\[valid\] Ep\. [\d\.]+ : ' + r'Up\. 
(?P[\d\.]+) : ' + r'(?P[^ ]+) : ' + r'(?P[\d\.]+) :' + ) + # Find model.iterXYZ.npz files and validation scores + lines = [] # [(checkpoint, update, { metric: value })] + with open(logs, "r") as logfile: + for line in logfile: + m = match_model.search(line) + if m: + model = m.group("model") + lines.append([model, None, {}]) + continue + m = match_valid.search(line) + if m: + update = m.group("update") + name = m.group("metric") + value = float(m.group("value")) + if metric not in lines[-1][-1]: + lines[-1][1] = update + lines[-1][-1][name] = value + + # Check if the requested metric is found + metrics = lines[0][-1].keys() + if metric not in metrics: + raise ValueError( + "metric '{}' not found in {}, choose from: {}".format( + metric, logs, " ".join(metrics) + ) + ) + exit(1) + + # Select best N checkpoints + models_all = [(line[0], line[2][metric]) for line in lines] + reverse = True if order.lower() == 'desc' else False + models_top = sorted(models_all, key=lambda p: p[1], reverse=reverse)[:best] + + print("Selected checkpoints:") + for model, value in models_top: + print(" {} {}={:.4f}".format(model, metric, value)) + + return [p[0] for p in models_top] + + +def parse_cmd_args(): + # Parse arguments + parser = argparse.ArgumentParser() + parser.add_argument('-m', '--model', nargs='+', help="models to average") + parser.add_argument( + '--from-log', + nargs='+', + metavar="ARG", + help="average from train.log, args: path N metric", + ) + parser.add_argument('-o', '--output', required=True, help="output path") + args = parser.parse_args() + + if (not args.model and not args.from_log) or (args.model and args.from_log): + parser.error('either -m/--model or --from-log must be set') + return args + + +if __name__ == "__main__": + main() From e13053a6f2e112886839f270e5a0f2d4ecf93fc7 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 16 Sep 2022 09:30:10 +0000 Subject: [PATCH 195/254] Merged PR 25698: Install Python 3.8 on GPU pool Python >= 3.8 is required for numpy >= 1.22, which is the minimum version without vulnerability issues. --- azure-regression-tests.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index d6053e53c..c849b59df 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -30,10 +30,15 @@ stages: # librt.* from the default anaconda environment are deleted because they crash the linker at the # end of compilation. This is an issue with the pre-defined VM image that is used for the Pool # and will not persist for other images - # TODO: There should be no need to install python3 - bash: | rm -f /anaconda/envs/py38_default/x86_64-conda-linux-gnu/sysroot/usr/lib/librt.* - sudo apt-get install -y gcc-8 g++-8 p7zip-full python3-pip + sudo apt-get install -y gcc-8 g++-8 p7zip-full + # TODO: There should be no need to install python3 + sudo apt-get install -y python3.8 python3.8-dev python3.8-distutils python3.8-venv + sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 + sudo update-alternatives --set python3 /usr/bin/python3.8 + sudo apt-get install -y python3-pip + python3 -m pip install --upgrade Cython displayName: Clean and install packages # Collect details about CPU and GPU. 
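As a usage sketch for the `--from-log` option added to `scripts/checkpoints/average.py` in PR 25465 above (the log and checkpoint file names below are placeholders, not taken from the patch):

    # average the 5 checkpoints with the best chrF scores reported in train.log
    ./average.py --from-log train.log 5 chrf --output model.avg.npz

    # the previous behaviour, an explicit list of checkpoints, still works
    ./average.py -m model.iter10000.npz model.iter20000.npz --output model.avg.npz

If the asc|desc argument is omitted, the script assumes ascending order for perplexity and descending order for all other metrics, as described in the commit message above.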
From 76964791ad6b56c682f3188fe8e256592bb5e0e0 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 16 Sep 2022 22:53:08 +0000 Subject: [PATCH 196/254] Merged PR 23767: More principled sampling and force-decoding This PR adds correct force-decoding and more principled sampling, both should now work for ensembles, batches and with beam search. --- src/common/config_parser.cpp | 12 ++- src/data/corpus_base.cpp | 16 +++- src/graph/node_operators_binary.h | 6 +- src/models/costs.cpp | 35 ------- src/models/costs.h | 26 ----- src/models/model_factory.cpp | 23 +---- src/tensors/gpu/tensor_operators.cu | 12 +++ src/translator/beam_search.cpp | 28 ++++-- src/translator/beam_search.h | 1 + src/translator/sampling.h | 141 ++++++++++++++++++++++++++++ 10 files changed, 199 insertions(+), 101 deletions(-) create mode 100644 src/translator/sampling.h diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 404b43f1e..338933d9f 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -681,6 +681,11 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.add("--alignment", "Return word alignment. Possible values: 0.0-1.0, hard, soft") ->implicit_val("1"); + cli.add("--force-decode", + "Use force-decoding of given prefixes. Forces decoding to follow vocab IDs from last stream in the batch (or the first stream, if there is only one). " + "Use either as `./marian-decoder --force-decode --input source.txt prefixes.txt [...]` where inputs and prefixes align on line-level or as " + "`paste source.txt prefixes.txt | ./marian-decoder --force-decode --tsv --tsv-fields 2 [...]` when reading from stdin." + ); cli.add("--word-scores", "Print word-level scores. One score per subword unit, not normalized even if --normalize"); cli.add("--stat-freq", @@ -709,9 +714,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.add>("--weights", "Scorer weights"); cli.add>("--output-sampling", - "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. " - " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.") - ->implicit_val("full"); + "Noise output layer with gumbel noise. Implicit default is 'full 1.0' for sampling from full distribution" + " with softmax temperature 1.0. Also accepts 'topk num temp' (e.g. topk 100 0.1) for top-100 sampling with" + " temperature 0.1") + ->implicit_val("full 1.0"); cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 71c9f9908..addcc3bfa 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -347,11 +347,18 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) auto vocabDims = options_->get>("dim-vocabs"); vocabDims.resize(numVocs, 0); - for(size_t i = 0; i + 1 < numVocs; ++i) { + + // when force-decoding we want the last vocab to be part of the batch, + // hence we do not drop it from the input batch. + bool forceDecoding = options_->get("force-decode", false); + size_t shift = !forceDecoding ? 
1 : 0; + + for(size_t i = 0; i + shift < numVocs; ++i) { Ptr vocab = New(options_, i); vocabDims[i] = (int) vocab->load(vocabPaths[i], maxVocabs[i]); vocabs_.emplace_back(vocab); - } + } + // TODO: As above, this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab // creation functionality from the class. @@ -368,10 +375,11 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) } } - ABORT_IF(!tsv_ && vocabs_.size() != files_.size(), + size_t numStreams = files_.size(); + ABORT_IF(!tsv_ && vocabs_.size() != numStreams, "Number of {} files ({}) and vocab files ({}) does not agree", training ? "corpus" : "input", - files_.size(), + numStreams, vocabs_.size()); // Handle guided alignment and data weighting files. Alignments and weights in TSV input were diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index f46e0b899..7a4824ef0 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1040,7 +1040,7 @@ struct GatherNodeOp : public NaryNodeOp { NodeOps backwardOps() override { return {NodeOp( // @TODO: rename to scatter - Insert(child(0)->grad(), adj_, child(1)->val(), axis_))}; + Insert(child(0)->grad(), adj_, /*indices=*/child(1)->val(), axis_))}; } Shape newShape(Expr a, int axis, Expr indices) { @@ -1097,7 +1097,7 @@ struct ScatterNodeOp : public NaryNodeOp { NodeOps forwardOps() override { return {NodeOp( CopyCast(val_, child(0)->val()); // @TODO: use normal copy - Insert(val_, child(2)->val(), child(1)->val(), axis_) + Insert(val_, /*source=*/child(2)->val(), /*indices=*/child(1)->val(), axis_) )}; } @@ -1107,7 +1107,7 @@ struct ScatterNodeOp : public NaryNodeOp { Shape newShape(Expr a, int axis, Expr indices, Expr source) { ABORT_IF(axis != -1, "only last dimensions"); - ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); + // ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); or broadcast Shape shape = a->shape(); // @TODO: do proper checking diff --git a/src/models/costs.cpp b/src/models/costs.cpp index 4b15bcb36..c688b2119 100644 --- a/src/models/costs.cpp +++ b/src/models/costs.cpp @@ -10,40 +10,5 @@ Ptr LogSoftmaxStep::apply(Ptr state) { return state; } -Ptr GumbelSoftmaxStep::apply(Ptr state) { - state->setLogProbs(state->getLogProbs().applyUnaryFunctions( - [](Expr logits) { // lemma gets gumbelled - return logsoftmax(logits + constant_like(logits, inits::gumbel())); - }, - logsoftmax)); // factors don't - return state; -} - -TopkGumbelSoftmaxStep::TopkGumbelSoftmaxStep(int k) : k_{k} {} - -Ptr TopkGumbelSoftmaxStep::apply(Ptr state) { - state->setLogProbs(state->getLogProbs().applyUnaryFunctions( - [=](Expr logits) { // lemma gets gumbelled - // create logits-sized tensor consisting only of invalid path scores - float invalidPathScore = NumericLimits(logits->value_type()).lowest; - Expr invalidLogits = constant_like(logits, inits::fromValue(invalidPathScore)); - - // select top-k values - Expr val, idx; - std::tie(val, idx) = topk(logits, k_, /*axis=*/-1, /*descending=*/true); - - // uncomment below to display probability mass in top-k selection - // debug(sum(gather(softmax(logits), -1, idx), -1), "sum"); - - // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search - Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel())); - - // Scatter gumbelled values back into logits to fill with 
usable values - return scatter(invalidLogits, -1, idx, gumbelVal); - }, - logsoftmax)); // factors don't - return state; -} - } // namespace models } // namespace marian diff --git a/src/models/costs.h b/src/models/costs.h index 9bb2b1039..f1c9931a3 100644 --- a/src/models/costs.h +++ b/src/models/costs.h @@ -297,32 +297,6 @@ class LogSoftmaxStep : public ILogProbStep { virtual Ptr apply(Ptr state) override; }; -// Gumbel-max noising for sampling during translation. -// Produces accurate sampling with beam=1. Turn on -// with --output-sampling [full] during translation -// with marian-decoder for samnpling from the full -// softmax distribution. -class GumbelSoftmaxStep : public ILogProbStep { -public: - virtual ~GumbelSoftmaxStep() {} - virtual Ptr apply(Ptr state) override; -}; - - -// Gumbel-max noising for top-k sampling during translation. -// Produces accurate sampling with beam=1. Turn on -// with --output-sampling topk [10] during translation -// with marian-decoder for top-10 sampling. -class TopkGumbelSoftmaxStep : public ILogProbStep { -private: - int k_{1}; - -public: - TopkGumbelSoftmaxStep(int k); - virtual ~TopkGumbelSoftmaxStep() {} - virtual Ptr apply(Ptr state) override; -}; - // class to wrap an IEncoderDecoder and a ILogProbStep that are executed in sequence, // wrapped again in the IEncoderDecoder interface // @TODO: seems we are conflating an interface defition with its implementation? diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 394344f72..5a317019d 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -370,28 +370,7 @@ Ptr createModelFromOptions(Ptr options, usage use) { // add (log)softmax if requested if (use == usage::translation) { if(std::dynamic_pointer_cast(baseModel)) { - if(options->hasAndNotEmpty("output-sampling")) { - auto sampling = options->get>("output-sampling", {}); - std::string method = sampling.size() > 0 ? sampling[0] : "full"; - - if(method == "0") { /*for backwards-compat when output-sampling: false in yaml file*/ - // do normal decoding - return New(std::dynamic_pointer_cast(baseModel), New()); - } else if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) { - LOG(info, "Output sampling from the full softmax distribution"); - return New(std::dynamic_pointer_cast(baseModel), New()); - } else if(method == "topk") { - int k = sampling.size() > 1 ? 
std::stoi(sampling[1]) : 10; - if(k == 1) - LOG(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); - LOG(info, "Output sampling via top-{} sampling", k); - return New(std::dynamic_pointer_cast(baseModel), New(k)); - } else { - ABORT("Unknown sampling method: {}", method); - } - } else { - return New(std::dynamic_pointer_cast(baseModel), New()); - } + return New(std::dynamic_pointer_cast(baseModel), New()); } #ifdef COMPILE_EXAMPLES // note: 'usage::translation' here means 'inference' diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 9011f284a..51e6f2f2d 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -199,6 +199,8 @@ void CopyCastFrom(Tensor out, const T* in, int length) { #endif } else if(out->type() == Type::float64) { CopyCastTo(out->data(), in, length); + } else if(out->type() == Type::uint32) { + CopyCastTo(out->data(), in, length); } else { ABORT("CopyCastTo to type {} not implemented", out->type()); } @@ -313,6 +315,8 @@ void Concatenate1(Tensor out, const std::vector& inputs) { } else if(out->type() == Type::float16) { gInsertCols<<>>(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0); #endif + } else if(out->type() == Type::uint32) { + gInsertCols<<>>(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0); } else { ABORT("Concatenate1 not implemented for type {}", out->type()); } @@ -392,6 +396,14 @@ void Concatenate2(Tensor out, Tensor in1, Tensor in2) { in2->data(), rowStride2); #endif + } else if(out->type() == Type::uint32) { + gJoin2<<>>(out->data(), + rowBatch, + cols, + in1->data(), + rowStride1, + in2->data(), + rowStride2); } else { ABORT("Concatenate2 not implemented for type {}", out->type()); } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 580895f2f..901eddc5c 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -1,10 +1,9 @@ -#include "translator/beam_search.h" - +#include "common/utils.h" #include "data/factored_vocab.h" -#include "translator/helpers.h" -#include "translator/nth_element.h" #include "data/shortlist.h" -#include "common/utils.h" +#include "translator/beam_search.h" +#include "translator/helpers.h" +#include "translator/sampling.h" namespace marian { @@ -316,6 +315,8 @@ Histories BeamSearch::search(Ptr graph, Ptr suppressedWordIndices = graph->indices(suppressed); } + auto distMod = New(options_, batch, INVALID_PATH_SCORE); + // the decoding process updates the following state information in each output time step: // - beams: array [origDimBatch] of array [maxBeamSize] of Hypothesis // - current output time step's set of active hypotheses, aka active search space @@ -413,9 +414,9 @@ Histories BeamSearch::search(Ptr graph, Ptr //********************************************************************** // compute expanded path scores with word prediction probs from all scorers - auto expandedPathScores = prevPathScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab] - Expr logProbs; + Expr stepScores; for(size_t i = 0; i < scorers_.size(); ++i) { + Expr logProbs; if (factorGroup == 0) { // compute output probabilities for current output time step // - uses hypIndices[index in beam, 1, batch index, 1] to reorder scorer state to reflect the top-N in beams[][] @@ -449,10 +450,19 @@ Histories BeamSearch::search(Ptr graph, Ptr logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, /*shortlist=*/ nullptr, hypIndices, 
maxBeamSize); // [maxBeamSize, 1, currentDimBatch, dimVocab] } // expand all hypotheses, [maxBeamSize, 1, currentDimBatch, 1] -> [maxBeamSize, 1, currentDimBatch, dimVocab] - expandedPathScores = expandedPathScores + scorers_[i]->getWeight() * logProbs; + if(i == 0) + stepScores = scorers_[i]->getWeight() * logProbs; + else + stepScores = stepScores + scorers_[i]->getWeight() * logProbs; + } + + if(factorGroup == 0) { + stepScores = distMod->force(stepScores, (int)t, (int)maxBeamSize, batchIndices); + stepScores = distMod->sample(stepScores); } // make beams continuous + auto expandedPathScores = prevPathScores + stepScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab] expandedPathScores = swapAxes(expandedPathScores, 0, 2); // -> [currentDimBatch, 1, maxBeamSize, dimVocab] // perform NN computation @@ -463,6 +473,7 @@ Histories BeamSearch::search(Ptr graph, Ptr //********************************************************************** // suppress specific symbols if not at right positions + // @TODO: move this to DistributionModifier if(suppressedWordIndices && factorGroup == 0) suppressWords(expandedPathScores, suppressedWordIndices); @@ -477,6 +488,7 @@ Histories BeamSearch::search(Ptr graph, Ptr /*out*/ nBestPathScores, /*out*/ nBestKeys, /*first=*/t == 0 && factorGroup == 0); // @TODO: this is only used for checking presently, and should be removed altogether + // Now, nBestPathScores contain N-best expandedPathScores for each batch and beam, // and nBestKeys for each their original location (batchIdx, beamHypIdx, word). diff --git a/src/translator/beam_search.h b/src/translator/beam_search.h index e2de7d243..75a9caeb0 100644 --- a/src/translator/beam_search.h +++ b/src/translator/beam_search.h @@ -3,6 +3,7 @@ #include "marian.h" #include "translator/history.h" #include "translator/scorers.h" +#include "translator/nth_element.h" namespace marian { diff --git a/src/translator/sampling.h b/src/translator/sampling.h new file mode 100644 index 000000000..4ac2063e9 --- /dev/null +++ b/src/translator/sampling.h @@ -0,0 +1,141 @@ + namespace marian { + + class DistModifier { + private: + Ptr options_; + bool forceDecode_{false}; + bool sampling_{false}; + std::string samplingMethod_; + int topk_{10}; + float temperature_{1.f}; + + Ptr batch_; + float invalidPathScore_; + + Expr forceBatch_; + + public: + DistModifier(Ptr options, Ptr batch, float invalidPathScore) : + options_(options), forceDecode_(options_->get("force-decode", false)), + batch_(batch), invalidPathScore_(invalidPathScore) { + + if(options_->hasAndNotEmpty("output-sampling")) { + sampling_ = true; + auto samplingOpts = options_->get>("output-sampling", {}); + samplingMethod_ = samplingOpts.size() > 0 ? 
samplingOpts[0] : "full"; + if(samplingMethod_ == "0") { // for backcompat with boolean values + sampling_ = false; + samplingMethod_ = ""; + } else if(samplingMethod_ == "1") { // for backcompat with boolean values + sampling_ = true; + samplingMethod_ = "full"; + } + + if(samplingMethod_ == "full") { + if(samplingOpts.size() > 1) + temperature_ = std::stof(samplingOpts[1]); + } + + if(samplingMethod_ == "topk") { + if(samplingOpts.size() > 1) + topk_ = std::stoi(samplingOpts[1]); + if(samplingOpts.size() > 2) + temperature_ = std::stof(samplingOpts[2]); + } + } + } + + Expr force(Expr scores, int pos, int beamSize, std::vector& batchIndices) { + // we check the last field of the batch for force-decoding content + int dimTime = (int)batch_->back()->batchWidth(); + if(!forceDecode_ || pos >= dimTime) // nothing to force-decode, just return original scores + return scores; + + LOG_ONCE(info, "Force-decoding with given prefixes"); + // if we get here, then we have to do force-decoding. We do this by "softly" modifying the scores and passing the + // result to the normal top-k/beam search. "Softly" here means we add masking terms rather than making hard selections + // which preserves the original tensor layout. + // This allows for beam-search and batched force-decoding with different length prefixes in a batch + // (way harder to do with actual index manipulation). We then return modified (masked) probabilities to the beam-search + // which then continues as normal on the modified distribution. + + if(!forceBatch_) { + // turn the batch into a cached tensor that lives in the computation graph + std::vector forceWords; + for(auto& word : batch_->back()->data()) + forceWords.push_back(word.toWordIndex()); + + int dimBatch = (int)batch_->back()->batchSize(); + forceBatch_ = scores->graph()->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] + } + + // if we remove batch entries during decoding (finished decoding) then adjust here + if(forceBatch_->shape()[-2] != batchIndices.size()) + forceBatch_ = index_select(forceBatch_, -2, batchIndices); + + // get vocab index and probability for force-decoded tokens for the current time step + Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + Expr forceVals = gather(scores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] + + // create dummy indices and values for beam entries other then the force-decoded value. This is required to ensure that the beam + // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch + // entry. We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. + int dimVocab = scores->shape()[-1]; + auto graph = scores->graph(); + // we start at 256 to skip over suppressed special words in SentencePiece @TODO: this should be somehow inferred. + Expr dummyIndices = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(256.f, (float)dimVocab)), {0, 0, 0, 1}, 0.f); + // we use a range of invalidPathScore_ to invalidPathScore_ / 2 to make sure that the probabilities stay low, but larger than invalidPathScore_ itself. 
+ Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); + + // here we add the force-decoded entries back into the zeroed positions + dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); + dummyVals = dummyVals + forceVals; + + // create a tensor of the same size as the original logits, initialize with invalidPathScore and then scatter the force-decoded and + // dummy values into the correct positions. + Expr forcedScores = constant_like(scores, inits::fromValue(invalidPathScore_)); + forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); + + // for entries that have finished force-decoding (the batch has eosId as vocab id) use the original logits for the whole batch entry + // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means + // free decoding or sampling. + WordIndex eosId = batch_->back()->vocab()->getEosId().toWordIndex(); + auto interpol = eq(cast(forceIndices, scores->value_type()), (float)eosId); + return interpol * scores + (1.f - interpol) * forcedScores; + } + + Expr sample(Expr scores) { + if(sampling_) { + if(temperature_ != 1.f) + scores = scores / temperature_; + + if(samplingMethod_ == "full") { + LOG_ONCE(info, "Output sampling from the full softmax distribution with temperature {}", temperature_); + return logsoftmax(scores + constant_like(scores, inits::gumbel())); + } else if(samplingMethod_ == "topk") { + if(topk_ == 1) + LOG_ONCE(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); + LOG_ONCE(info, "Output sampling via top-{} sampling with temperature {}", topk_, temperature_); + + Expr invalidLogits = constant_like(scores, inits::fromValue(invalidPathScore_)); + + // select top-k values + Expr val, idx; + std::tie(val, idx) = topk(scores, topk_, /*axis=*/-1, /*descending=*/true); + + // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search + Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel())); + + // Scatter gumbelled values back into logits to fill with usable values + return scatter(invalidLogits, -1, idx, gumbelVal); + } else { + ABORT("Unknown sampling method: {}", samplingMethod_); + } + } else { // no sampling + return scores; + } + } + + }; + + } \ No newline at end of file From 7d2045a9072b7e4b0afef7e80473e95949298ae1 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 21 Sep 2022 20:39:54 +0000 Subject: [PATCH 197/254] Merged PR 25686: Loading checkpoints from main node only via MPI Enables loading of model checkpoints from main node only via MPI. Until now the checkpoint needed to present in the same location on all nodes. That could be done either via writing to a shared filesystem (problematic due to bad syncing) or by manual copying to the same local location, e.g. /tmp on each node (while writing only happened to one main location). Now, marian can resume training from only one location on the main node. The remaining nodes do not need to have access. E.g. local /tmp on the main node can be used, or race conditons on shared storage are avoided. Also avoids creating files for logging on more than one node. This is a bit wonky, done via environment variable lookup. 
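In short, only the main MPI process reads the checkpoint from disk; the deserialized items are then broadcast to every other rank with the bCast overloads added in this patch. A minimal sketch of that pattern (the helper name loadOnMainAndBroadcast is illustrative only; the actual logic lives in GraphGroup::load() and IMPIWrapper further down in this patch):

  // Sketch, assuming Marian's io::Item, filesystem and IMPIWrapper interfaces as used below.
  // Rank 0 reads the checkpoint; all other ranks receive the same items via MPI broadcast.
  std::vector<io::Item> loadOnMainAndBroadcast(const std::string& path, Ptr<IMPIWrapper> mpi) {
    std::vector<io::Item> items;
    bool found = false;
    if(!mpi || mpi->isMainProcess()) {                   // only the main node touches the filesystem
      found = filesystem::exists(path);
      if(found)
        items = io::loadItems(path);                     // deserialize bytes, shapes, names, types
    }
    if(mpi) {
      mpi->bCast(&found, 1, mpi->getDataType(&found));   // tell all ranks whether a checkpoint exists
      if(found)
        mpi->bCast(items);                               // new overload: broadcast a vector of io::Item
    }
    return items;
  }
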
--- CHANGELOG.md | 3 ++ VERSION | 2 +- src/common/cli_helper.h | 18 --------- src/common/file_stream.cpp | 11 ++++- src/common/file_stream.h | 3 +- src/common/logging.cpp | 13 ++++-- src/common/utils.cpp | 12 +++++- src/common/utils.h | 3 ++ src/embedder/embedder.h | 10 ----- src/examples/mnist/model.h | 4 ++ src/models/costs.h | 12 ++++++ src/models/encoder_classifier.h | 6 +++ src/models/encoder_decoder.h | 3 +- src/models/encoder_pooler.h | 11 +++++ src/models/model_base.h | 13 ++++++ src/rescorer/rescorer.h | 10 ----- src/training/communicator.cpp | 61 +++++++++++++++++++++++++++- src/training/communicator.h | 26 +++--------- src/training/graph_group.cpp | 72 ++++++++++++++++++++++++--------- src/training/scheduler.h | 32 +++++++-------- src/training/training_state.h | 17 +++++--- src/translator/translator.h | 9 ----- 22 files changed, 232 insertions(+), 119 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01aea0251..23008b734 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- `--force-decode` option for marian-decoder +- `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`) ### Fixed +- Read/restore checkpoints from main process only when training with MPI - Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast) - Throw `ShapeSizeException` if total expanded shape size exceeds numeric capacity of the maximum int value (2^31-1) - During mini-batch-fitting, catch `ShapeSizeException` and use another sizing hint. Aborts outside mini-batch-fitting. diff --git a/VERSION b/VERSION index 77418c859..316ba050f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.7 +v1.11.9 diff --git a/src/common/cli_helper.h b/src/common/cli_helper.h index dc8eafdf6..44236e0d2 100644 --- a/src/common/cli_helper.h +++ b/src/common/cli_helper.h @@ -19,24 +19,6 @@ static inline std::string interpolateEnvVars(std::string str) { return str; } -#if 1 - if(getenv("PHILLY_JOB_ID")) { - const char* cluster = getenv("PHILLY_CLUSTER"); - const char* vc = getenv("PHILLY_VC"); - // this environment variable exists when running on the cluster - if(cluster && vc) { - static const std::string s_gfsPrefix - = std::string("/gfs/") + cluster + "/" + vc + "/"; - static const std::string s_hdfsPrefix - = std::string("/hdfs/") + cluster + "/" + vc + "/"; - if(str.find(s_gfsPrefix) == 0) - str = std::string("/hdfs/") + vc + "/" + str.substr(s_gfsPrefix.size()); - else if(str.find(s_hdfsPrefix) == 0) - str = std::string("/hdfs/") + vc + "/" - + str.substr(s_hdfsPrefix.size()); - } - } -#endif for(;;) { const auto pos = str.find("${"); if(pos == std::string::npos) diff --git a/src/common/file_stream.cpp b/src/common/file_stream.cpp index 14da7c9b4..e1572f62e 100644 --- a/src/common/file_stream.cpp +++ b/src/common/file_stream.cpp @@ -74,6 +74,12 @@ std::string InputFileStream::getFileName() const { return file_.string(); } +std::string InputFileStream::readToString() const { + std::stringstream ss; + ss << this->rdbuf(); + return ss.str(); +} + // wrapper around std::getline() that handles Windows input files with extra CR // chars at the line end std::istream &getline(std::istream &in, std::string &line) { @@ -85,6 +91,7 @@ std::istream &getline(std::istream &in, std::string &line) { line.pop_back(); return in; } + /////////////////////////////////////////////////////////////////////////////////////////////// 
OutputFileStream::OutputFileStream(const std::string &file) : std::ostream(NULL), file_(file) { @@ -119,7 +126,7 @@ TemporaryFile::TemporaryFile(const std::string &base, bool earlyUnlink) NormalizeTempPrefix(baseTemp); MakeTemp(baseTemp); - inSteam_ = UPtr(new io::InputFileStream(file_.string())); + inStream_ = UPtr(new io::InputFileStream(file_.string())); if(unlink_) { ABORT_IF(remove(file_.string().c_str()), "Error while deleting '{}'", file_.string()); } @@ -190,7 +197,7 @@ void TemporaryFile::MakeTemp(const std::string &base) { } UPtr TemporaryFile::getInputStream() { - return std::move(inSteam_); + return std::move(inStream_); } std::string TemporaryFile::getFileName() const { diff --git a/src/common/file_stream.h b/src/common/file_stream.h index ccf33ed86..be867b743 100644 --- a/src/common/file_stream.h +++ b/src/common/file_stream.h @@ -46,6 +46,7 @@ class InputFileStream : public std::istream { bool empty(); void setbufsize(size_t size); std::string getFileName() const; + std::string readToString() const; protected: marian::filesystem::Path file_; @@ -92,7 +93,7 @@ class TemporaryFile : public OutputFileStream { protected: bool unlink_; - UPtr inSteam_; + UPtr inStream_; void NormalizeTempPrefix(std::string& base) const; void MakeTemp(const std::string& base); diff --git a/src/common/logging.cpp b/src/common/logging.cpp index f77a41df6..69efeb482 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -1,5 +1,7 @@ #include "logging.h" #include "common/config.h" +#include "common/utils.h" + #include "spdlog/sinks/null_sink.h" #include "3rd_party/ExceptionWithCallStack.h" #include @@ -30,9 +32,14 @@ std::shared_ptr createStderrLogger(const std::string& name, if(!quiet) sinks.push_back(stderr_sink); - for(auto&& file : files) { - auto file_sink = std::make_shared(file, true); - sinks.push_back(file_sink); + // @TODO: think how to solve this better than using OMPI_COMM_WORLD_RANK env variable + // only create output files if we are the main process or if MPI rank is not defined + int rank = marian::utils::getMPIRankEnv(); // this function looks up OMPI_COMM_WORLD_RANK env variable + if(rank == 0) { + for(auto&& file : files) { + auto file_sink = std::make_shared(file, true); + sinks.push_back(file_sink); + } } auto logger = std::make_shared(name, begin(sinks), end(sinks)); diff --git a/src/common/utils.cpp b/src/common/utils.cpp index 99fc790a2..1f3fd6c07 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -180,7 +180,8 @@ std::string exec(const std::string& cmd, const std::vector& args /* std::pair hostnameAndProcessId() { // helper to get hostname:pid #ifdef _WIN32 - std::string hostname = getenv("COMPUTERNAME"); + const char* res = getenv("COMPUTERNAME"); + std::string hostname = res ? std::string(res) : ""; auto processId = (int)GetCurrentProcessId(); #else static std::string hostname = []() { // not sure if gethostname() is expensive. This way we call it only once. 
@@ -193,6 +194,15 @@ std::pair hostnameAndProcessId() { // helper to get hostname: return {hostname, processId}; } +// returns MPI rank from environment variable if set, otherwise 0 +int getMPIRankEnv() { + const char* rank = getenv("OMPI_COMM_WORLD_RANK"); + if(rank) + return std::atoi(rank); + else + return 0; +} + // format a long number with comma separators std::string withCommas(size_t n) { std::string res = std::to_string(n); diff --git a/src/common/utils.h b/src/common/utils.h index 13b50c0bd..fbcf672d7 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -43,6 +43,9 @@ std::string exec(const std::string& cmd, const std::vector& args = std::pair hostnameAndProcessId(); +// returns MPI rank from environment variable if set, otherwise 0 +int getMPIRankEnv(); + std::string withCommas(size_t n); bool beginsWith(const std::string& text, const std::string& prefix); bool endsWith(const std::string& text, const std::string& suffix); diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index 36b3df444..d45e14cd3 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -141,16 +141,6 @@ class Embed : public ModelTask { output->Write((long)batch->getSentenceIds()[i], sentVector); } - - // progress heartbeat for MS-internal Philly compute cluster - // otherwise this job may be killed prematurely if no log for 4 hrs - if (getenv("PHILLY_JOB_ID") // this environment variable exists when running on the cluster - && id % 1000 == 0) // hard beat once every 1000 batches - { - auto progress = id / 10000.f; //fake progress for now, becomes >100 after 1M batches - fprintf(stderr, "PROGRESS: %.2f%%\n", progress); - fflush(stderr); - } }; pool.enqueue(task, batchId++); diff --git a/src/examples/mnist/model.h b/src/examples/mnist/model.h index f7af16811..5d50eae96 100755 --- a/src/examples/mnist/model.h +++ b/src/examples/mnist/model.h @@ -75,6 +75,10 @@ class MnistFeedForwardNet : public IModel { return Logits(apply(graph, batch, inference_)); } + void load(Ptr /*graph*/, const std::vector& /*items*/, bool) override { + LOG(critical, "Loading MNIST model is not supported"); + } + void load(Ptr /*graph*/, const std::string& /*name*/, bool) override { LOG(critical, "Loading MNIST model is not supported"); } diff --git a/src/models/costs.h b/src/models/costs.h index f1c9931a3..45527362f 100644 --- a/src/models/costs.h +++ b/src/models/costs.h @@ -217,6 +217,12 @@ class Trainer : public ICriterionFunction { Ptr getModel() { return model_; } + void load(Ptr graph, + const std::vector& items, + bool markedReloaded) override { + model_->load(graph, items, markedReloaded); + } + virtual void load(Ptr graph, const std::string& name, bool markedReloaded = true) override { @@ -263,6 +269,12 @@ class Scorer : public IModel { Ptr getModel() { return model_; } + virtual void load(Ptr graph, + const std::vector& items, + bool markReloaded = true) override { + model_->load(graph, items, markReloaded); + } + virtual void load(Ptr graph, const std::string& name, bool markedReloaded = true) override { diff --git a/src/models/encoder_classifier.h b/src/models/encoder_classifier.h index bb8d28564..265bdacbd 100644 --- a/src/models/encoder_classifier.h +++ b/src/models/encoder_classifier.h @@ -152,6 +152,12 @@ class EncoderClassifier : public EncoderClassifierBase { void push_back(Ptr encoder) { encoders_.push_back(encoder); } void push_back(Ptr classifier) { classifiers_.push_back(classifier); } + void load(Ptr graph, + const std::vector& items, + bool markedReloaded) override { + 
graph->load(items, markedReloaded && !opt("ignore-model-config", false)); + } + void load(Ptr graph, const std::string& name, bool markedReloaded) override { diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h index 0fbf3fafe..4ccc6a93f 100644 --- a/src/models/encoder_decoder.h +++ b/src/models/encoder_decoder.h @@ -15,7 +15,8 @@ class IEncoderDecoder : public models::IModel { virtual void load(Ptr graph, const std::vector& items, - bool markedReloaded = true) = 0; + bool markedReloaded = true) override + = 0; virtual void load(Ptr graph, const std::string& name, diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index 8a2123430..7bd17c41a 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -25,6 +25,11 @@ class EncoderPoolerBase : public models::IModel { public: virtual ~EncoderPoolerBase() {} + virtual void load(Ptr graph, + const std::vector& items, + bool markedReloaded = true) override + = 0; + virtual void load(Ptr graph, const std::string& name, bool markedReloaded = true) override @@ -162,6 +167,12 @@ class EncoderPooler : public EncoderPoolerBase { void push_back(Ptr encoder) { encoders_.push_back(encoder); } void push_back(Ptr pooler) { poolers_.push_back(pooler); } + void load(Ptr graph, + const std::vector& items, + bool markedReloaded) override { + graph->load(items, markedReloaded && !opt("ignore-model-config", false)); + } + void load(Ptr graph, const std::string& name, bool markedReloaded) override { diff --git a/src/models/model_base.h b/src/models/model_base.h index 09f3b7340..6a327968a 100644 --- a/src/models/model_base.h +++ b/src/models/model_base.h @@ -2,6 +2,7 @@ #include #include "marian.h" +#include "common/io_item.h" #include "layers/loss.h" #include "layers/generic.h" @@ -24,6 +25,12 @@ class IModel { const std::string&, bool markReloaded = true) = 0; + + virtual void load(Ptr, + const std::vector&, + bool markReloaded = true) + = 0; + virtual void save(Ptr, const std::string&, bool saveTranslatorConfig = false) @@ -47,6 +54,12 @@ class ICriterionFunction { const std::string&, bool markReloaded = true) = 0; + + virtual void load(Ptr, + const std::vector&, + bool markReloaded = true) + = 0; + virtual void save(Ptr, const std::string&, bool saveTranslatorConfig = false) diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index 26d74917e..062b91bca 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -201,16 +201,6 @@ class Rescore : public ModelTask { } } } - - // progress heartbeat for MS-internal Philly compute cluster - // otherwise this job may be killed prematurely if no log for 4 hrs - if (getenv("PHILLY_JOB_ID") // this environment variable exists when running on the cluster - && id % 1000 == 0) // hard beat once every 1000 batches - { - auto progress = id / 10000.f; //fake progress for now, becomes >100 after 1M batches - fprintf(stdout, "PROGRESS: %.2f%%\n", progress); - fflush(stdout); - } }; pool.enqueue(task, batchId++); diff --git a/src/training/communicator.cpp b/src/training/communicator.cpp index 602f7daa7..0e5881e5f 100644 --- a/src/training/communicator.cpp +++ b/src/training/communicator.cpp @@ -134,11 +134,12 @@ class MPIWrapper : public IMPIWrapper // get the limit for int count size_t limit = (size_t)std::numeric_limits::max(); - size_t remaining = count, offset = 0; + size_t remaining = count; + size_t offset = 0; // while there are elements that we have not sent yet, loop until all has been sent in chunks of at most `limit`. 
while(remaining > 0) { - int intCount = (int)std::min(remaining, limit); + int intCount = (int)std::min(remaining, limit); HANDLE_MPI_ERROR(MPI_Bcast((char*)buf + offset * (size_t)datatypeSize, intCount, datatype, (int)rootRank, comm)); offset += (size_t)intCount; remaining -= (size_t)intCount; @@ -193,6 +194,49 @@ class MPIWrapper : public IMPIWrapper virtual void finalize() override { HANDLE_MPI_ERROR(MPI_Finalize()); } + + virtual void bCast(io::Item& item, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const override { + if(isMainProcess()) + ABORT_IF(item.bytes.empty(), "Broadcasting empty item via MPI should not happen. Please report."); + + unsigned long long bytesLen = item.bytes.size(); + bCast(&bytesLen, 1, getDataType(&bytesLen), rootRank, comm); + + item.bytes.resize(bytesLen); + bCast(item.bytes.data(), bytesLen, getDataType(item.bytes.data()), rootRank, comm); + + unsigned long long shapeLen = item.shape.size(); + bCast(&shapeLen, 1, getDataType(&shapeLen), rootRank, comm); + item.shape.resize(shapeLen); + bCast(item.shape.data(), shapeLen, getDataType(item.shape.data()), rootRank, comm); + + bCast(item.name, rootRank, comm); + + size_t type = (size_t)item.type; + bCast(&type, 1, getDataType(&type), rootRank, comm); + item.type = (Type)type; + } + + virtual void bCast(std::vector& items, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const override { + size_t numItems = 0; + if(isMainProcess()) + numItems = items.size(); + + bCast(&numItems, 1, getDataType(&numItems), rootRank, comm); + items.resize(numItems); + for(auto& item : items) + bCast(item, rootRank, comm); + } + + virtual void bCast(std::string& str, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const override { + size_t length = 0; + if(isMainProcess()) + length = str.size(); + + bCast(&length, 1, getDataType(&length), rootRank, comm); + str.resize(length); + bCast(str.data(), length, getDataType(str.data()), rootRank, comm); + } }; #endif @@ -232,6 +276,19 @@ class FakeMPIWrapper : public IMPIWrapper // to only accept one parameter, and remove this error check can be removed. 
ABORT_IF(sendbuf != recvbuf, "FakeMPIWrapper::allReduce() only implemented for in-place operation"); // otherwise it's not a no-op, we must copy data } + + virtual void bCast(io::Item& item, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const override { + item; rootRank; comm; + } + + virtual void bCast(std::vector& items, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const override { + items; rootRank; comm; + } + + virtual void bCast(std::string& str, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const override { + str; rootRank; comm; + } + #pragma warning(pop) virtual void finalize() override { } }; diff --git a/src/training/communicator.h b/src/training/communicator.h index 5ab1b6b27..207c38810 100644 --- a/src/training/communicator.h +++ b/src/training/communicator.h @@ -68,7 +68,7 @@ class ICommunicator { #if MPI_FOUND #else enum MPI_Comm { MPI_COMM_WORLD }; -enum MPI_Datatype { MPI_FLOAT, MPI_UNSIGNED_LONG_LONG, MPI_UNSIGNED_LONG, MPI_BYTE, MPI_INT }; +enum MPI_Datatype { MPI_FLOAT, MPI_UNSIGNED_LONG_LONG, MPI_UNSIGNED_LONG, MPI_BYTE, MPI_INT, MPI_CXX_BOOL }; enum MPI_Op { MPI_SUM }; struct MPI_Status { int MPI_SOURCE; }; #define MPI_ANY_SOURCE ((size_t)-2) @@ -88,30 +88,16 @@ struct/*interface*/ IMPIWrapper { static const size_t RECV_ANY_SOURCE = (size_t)MPI_ANY_SOURCE; static MPI_Datatype getDataType(const char*) { return MPI_BYTE; } + static MPI_Datatype getDataType(const bool*) { return MPI_CXX_BOOL; } static MPI_Datatype getDataType(const int*) { return MPI_INT; } static MPI_Datatype getDataType(const float*) { return MPI_FLOAT; } static MPI_Datatype getDataType(const unsigned long*) { return MPI_UNSIGNED_LONG; } static MPI_Datatype getDataType(const unsigned long long*) { return MPI_UNSIGNED_LONG_LONG; } - void bCast(io::Item& item, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) { - ABORT_IF(item.bytes.empty(), "Broadcasting empty item via MPI??"); - - unsigned long long bytesLen = item.bytes.size(); - bCast(&bytesLen, 1, getDataType(&bytesLen), rootRank, comm); - - item.bytes.resize(bytesLen); - bCast(item.bytes.data(), item.bytes.size(), getDataType(item.bytes.data()), rootRank, comm); - - unsigned long long shapeLen = item.shape.size(); - bCast(&shapeLen, 1, getDataType(&shapeLen), rootRank, comm); - - bCast(item.shape.data(), item.shape.size(), getDataType(item.shape.data()), rootRank, comm); - - size_t type = (size_t)item.type; - bCast(&type, 1, getDataType(&type), rootRank, comm); - item.type = (Type)type; - } - + virtual void bCast(io::Item& item, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const = 0; + virtual void bCast(std::vector& items, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const = 0; + virtual void bCast(std::string& str, size_t rootRank = 0, MPI_Comm comm = MPI_COMM_WORLD) const = 0; + std::string idStr() const; }; diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 4d92b1c9c..d9a77a708 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -283,25 +283,51 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { */ if(!options_->get("no-reload")) { std::string modelFileName = options_->get("model"); + bool foundModel = false; + + // these are structures that get fill in the main process and then broadcasted to other MPI + std::vector items; + bool markReloaded = true; + + if(isMainProcess()) { + if(filesystem::exists(modelFileName)) { + LOG(info, "Loading model from {}", modelFileName); + foundModel = true; + items = 
io::loadItems(modelFileName); + markReloaded = true; + } else if(options_->hasAndNotEmpty("pretrained-model")) { + std::string pretrainedModelFileName = options_->get("pretrained-model"); + LOG(info, "[training] Initializing model weights with pre-trained model {}", pretrainedModelFileName); + foundModel = true; + items = io::loadItems(pretrainedModelFileName); + markReloaded = false; + } + } - if(filesystem::exists(modelFileName)) { + // if a model file exists, the main process will find it and propagate this information to other MPI nodes + if(mpi_) + mpi_->bCast(&foundModel, 1, mpi_->getDataType(&foundModel)); + + if(foundModel) { + // continue with checkpoint loading + if(mpi_) { + // broadcast model information to other processes + mpi_->bCast(items); + mpi_->bCast(&markReloaded, 1, mpi_->getDataType(&markReloaded)); + } + + // handles MPI if(scheduler_) scheduler_->load(modelFileName); + // we just load it N times from disk (it'll be in disk cache after the first) // this also allocates memory correctly when calling forward() inside restoreFromCheckPoint size_t i = 0; for(auto graph : graphs_) - models_[i++]->load(graph, modelFileName); + models_[i++]->load(graph, items, markReloaded); // try to restore everything from checkpoint now restoreFromCheckpoint(modelFileName, scatterFn); - } else if(options_->hasAndNotEmpty("pretrained-model")) { - std::string nameInit = options_->get("pretrained-model"); - LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit); - - size_t i = 0; - for(auto graph : graphs_) - models_[i++]->load(graph, nameInit, false); } } } @@ -316,19 +342,26 @@ bool GraphGroup::restoreFromCheckpoint(const std::string& modelFileName, std::string checkpointName = modelFileName + ".optimizer.npz"; // @TODO: change to .checkpoint.npz, would break backwards compat - if(!filesystem::exists(checkpointName)) { + // if a checkpoint exists, the main process will find it and propagate this information to other MPI nodes + bool foundCheckpoint = filesystem::exists(checkpointName); + if(mpi_) + mpi_->bCast(&foundCheckpoint, 1, mpi_->getDataType(&foundCheckpoint)); + + // all nodes will either continue or exit + if(!foundCheckpoint) { LOG(warn, "No checkpoint found, parameters reloaded from last inference model"); return false; // failed to restore } - auto items = io::loadItems(checkpointName); - - // make sure all nodes see the same checkpoint data, may not be the case with distributed file systems - // when there was a delay in updating the caches accross nodes. So here node 0 sends its data to all. - // We still load them all from disk, but that serves more as a trick to allocate the correct memory. - if(mpi_) - for(auto& item : items) - mpi_->bCast(item); + std::vector items; + // make sure all nodes receive the same checkpoint data from the main process. + if(mpi_) { // only the main process loads the checkpoint and the rest receives a copy + if(isMainProcess()) + items = io::loadItems(checkpointName); + mpi_->bCast(items); + } else { // not doing MPI, so just load the checkpoint from disk + items = io::loadItems(checkpointName); + } // @TODO: probably we want to have the list of DeviceIds as an attribute std::vector> backends; @@ -351,7 +384,8 @@ bool GraphGroup::restoreFromCheckpoint(const std::string& modelFileName, // run a full forward pass over the paramters to allocate the parameters values in order (by parameter name). // Just doing graph->params()->allocateForward() is not sufficient. 
ABORT_IF(graph->params()->vals()->shape() != masterParameters.shape, - "Graph parameter sizes and master copy parameter sizes in checkpoint do not match"); + "Graph parameter sizes and master copy parameter sizes in checkpoint do not match ({} != {})", + graph->params()->vals()->shape(), masterParameters.shape); // Convert type of io::Item to match graph parameter type. if(masterParameters.type != graph->params()->vals()->type()) diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 3cc3b2076..34aa18c21 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -478,24 +478,11 @@ class Scheduler : public TrainingObserver { state_->samplesDisp = 0; state_->wordsDisp = 0; } - - // progress heartbeat for MS-internal Philly compute cluster - // This environment variable exists when running on the cluster. - using namespace std::chrono; - if((!mpi_ || mpi_->myMPIRank() == 0) && getenv("PHILLY_JOB_ID") - && heartBeatTimer_.elapsed() >= 30) { - fprintf(stderr, "PROGRESS: %.2f%%\nEVALERR: %.7f%%\n", - (double)calculateLogicalEpoch(), - state_->costSum / (state_->costCount ? state_->costCount : 1)); - fflush(stderr); - heartBeatTimer_.start(); - } } - void load(const std::string& name) { - std::string nameYaml = name + ".progress.yml"; - if(filesystem::exists(nameYaml)) - state_->load(nameYaml); + void loadFromString(const std::string yamlString) { + if(!yamlString.empty()) + state_->loadFromString(yamlString); if(options_->get("no-restore-corpus")) { state_->samplesEpoch = 0; @@ -519,6 +506,19 @@ class Scheduler : public TrainingObserver { state_->newLoad(); } + void load(const std::string& name) { + std::string nameYaml = name + ".progress.yml"; + std::string yamlStr; + if(mpi_->isMainProcess()) + if(filesystem::exists(nameYaml)) + yamlStr = io::InputFileStream(nameYaml).readToString(); + + if(mpi_) + mpi_->bCast(yamlStr); + + loadFromString(yamlStr); + } + void save(const std::string& name) { // Save config options std::ofstream fout(name + ".yml"); diff --git a/src/training/training_state.h b/src/training/training_state.h index ce0895a24..2fb9209fa 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -1,11 +1,12 @@ #pragma once #include "common/definitions.h" +#include "common/file_stream.h" #include "common/filesystem.h" #include "common/scheduling_parameter.h" #include "common/utils.h" -#include +#include #include namespace marian { @@ -194,11 +195,8 @@ class TrainingState { } } - void load(const std::string& name) { - if(!filesystem::exists(name)) - return; - - YAML::Node config = YAML::LoadFile(name); + void loadFromString(const std::string& yamlString) { + YAML::Node config = YAML::Load(yamlString); epochs = config["epochs"].as(); batches = config["batches"].as(); @@ -242,6 +240,13 @@ class TrainingState { seedCorpus = config["seed-corpus"].as(); } + void load(const std::string& name) { + if(!filesystem::exists(name)) + return; + + loadFromString(io::InputFileStream(name).readToString()); + } + void save(const std::string& name) const { std::ofstream fout(name); YAML::Node config; diff --git a/src/translator/translator.h b/src/translator/translator.h index 3103e7ddc..205c213cb 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -165,15 +165,6 @@ class Translate : public ModelTask { // abort early to avoid potentially costly batching and translation before error message ABORT_IF(statFreq.unit != SchedulingUnit::updates, "Units other than 'u' are not supported for --stat-freq value {}", statFreq); - // 
Override display for progress heartbeat for MS-internal Philly compute cluster - // otherwise this job may be killed prematurely if no log for 4 hrs - if(getenv("PHILLY_JOB_ID")) { // this environment variable exists when running on the cluster - if(statFreq.n == 0) { - statFreq.n = 10000; - statFreq.unit = SchedulingUnit::updates; - } - } - bool doNbest = options_->get("n-best"); bg.prepare(); From cfc33f54984c64260a70c826f04e0d42955d3d81 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 22 Sep 2022 15:11:33 -0700 Subject: [PATCH 198/254] only use tcmalloc_minimal --- CMakeLists.txt | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dbad75cb5..3c674e68d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -491,15 +491,21 @@ if(USE_STATIC_LIBS) endif() ############################################################################### -# Find Tcmalloc +# Find Tcmalloc_minimal +# re-used from sentencepiece if(NOT WIN32) - find_package(Tcmalloc) - if(Tcmalloc_FOUND) - include_directories(${Tcmalloc_INCLUDE_DIR}) + if(USE_STATIC_LIBS) + find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a) + else() + find_library(TCMALLOC_LIB NAMES tcmalloc_minimal) + endif() + if (TCMALLOC_LIB) + message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}") set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES}) - else(Tcmalloc_FOUND) - message(WARNING "Cannot find TCMalloc library. Continuing.") - endif(Tcmalloc_FOUND) + add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free) + else() + message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}") + endif() endif() ############################################################################### From 1f2929d528b403748548605873dcf4f2d28c7ec0 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 26 Sep 2022 20:17:33 +0000 Subject: [PATCH 199/254] Merged PR 25733: Fused inplace ReLU and Dropout in transformer FFN layer * First attempt at fused inplace ReLU and Dropout in transformer FFN layer * Adds optional output projection to SSRU. For large FFN blocks and dropout about 20-25% speed improvement during training. 
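The fusion replaces the usual chain dropout(relu(x*W + b)) with a single elementwise pass over the affine output: because a (scaled) dropout mask m only contains non-negative values (0 or 1/(1-p)), relu(y)*m equals relu(y*m) elementwise, so mask and activation can be applied together and written back into the same buffer. A minimal CPU-side sketch of that step, outside Marian's functional framework (illustrative only; the patch implements it as DropoutReluInplaceNodeOp via Element()):

  #include <cstddef>

  // y holds the affine output x*W + b; mask holds 0 or 1/(1-p) per element.
  void fusedReluDropoutInplace(float* y, const float* mask, std::size_t n) {
    for(std::size_t i = 0; i < n; ++i) {
      float v = y[i] * mask[i];     // apply the scaled dropout mask
      y[i] = v > 0.f ? v : 0.f;     // ReLU, stored back in place
    }
  }
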
--- src/common/config_parser.cpp | 2 ++ src/graph/expression_operators.cpp | 28 +++++++++++++++----- src/graph/expression_operators.h | 13 +++++----- src/graph/node_operators_binary.h | 21 +++++++-------- src/graph/node_operators_unary.h | 41 ++++++++++++++++++++++++++++++ src/layers/generic.h | 8 ++++-- src/models/encoder_classifier.h | 1 + src/models/encoder_decoder.cpp | 1 + src/models/encoder_pooler.h | 1 + src/models/transformer.h | 16 +++++++++--- src/tensors/gpu/element.inc | 3 +++ src/tests/dropout.cpp | 8 ++---- src/tests/units/operator_tests.cpp | 2 +- 13 files changed, 109 insertions(+), 36 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 338933d9f..c9ab45f81 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -263,6 +263,8 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { 8); cli.add("--transformer-no-projection", "Omit linear projection after multi-head attention (transformer)"); + cli.add("--transformer-rnn-projection", + "Add linear projection after rnn layer (transformer)"); cli.add("--transformer-pool", "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index ca5e68054..09049f98f 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -676,13 +676,29 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { } } -Expr affineWithRelu(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { - auto graph = a->graph(); +// @TODO: unify all these +Expr affineWithReluDropout(Expr x, Expr W, Expr bias, float dropProb) { + auto graph = x->graph(); + if(graph->isInference() && graph->getDeviceId().type == DeviceType::gpu) { + // not doing any dropout in inference mode + return Expression(x, W, bias); + } else { + Expr output = affine(x, W, bias); + int dimModel = output->shape()[-1]; + int dimTime = output->shape()[-2]; + output = dropoutReluInplace(output, dropProb, {dimTime, dimModel}); + return output; + } +} - if(graph->isInference() && graph->getDeviceId().type == DeviceType::gpu) - return Expression(a, b, bias, transA, transB, scale); - else - return relu(affine(a, b, bias, transA, transB, scale)); +Expr dropoutReluInplace(Expr x, float dropProb, Shape shape) { + if(dropProb == 0) { + return relu(x); + } else { + auto graph = x->graph(); + auto mask = graph->dropoutMask(dropProb, shape); + return Expression(x, mask); + } } // @TODO: Not a great place to check this diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 1e98047f9..5d9ceab36 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -493,12 +493,10 @@ Expr affine(Expr a, /** * As above, but efficiently applies relu transformation to output. For inference only. */ -Expr affineWithRelu(Expr a, - Expr b, - Expr bias, - bool transA = false, - bool transB = false, - float scalar = 1.f); +Expr affineWithReluDropout(Expr a, + Expr b, + Expr bias, + float dropProb = 0.f); /** * Computes the dot product of CSR-tensor @p A with @p B. @@ -971,6 +969,7 @@ static inline Expr dropout(Expr x, float dropProb, Shape shape) { return dropout(x, mask); } + /** * Performs dropout with a given probably. 
*/ @@ -980,6 +979,8 @@ static inline Expr dropout(Expr x, float dropProb) { return dropout(x, dropProb, x->shape()); } +Expr dropoutReluInplace(Expr x, float dropProb, Shape shape); + /** * Shifts the elements of an expression by a per-axis offset @p shift * padded with @p padValue. diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 7a4824ef0..292554bd0 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -431,16 +431,13 @@ class AffineWithReluNodeOp : public NaryNodeOp { public: AffineWithReluNodeOp(Expr a, Expr b, - Expr bias, - bool transA, - bool transB, - float scalar) - : NaryNodeOp({a, b, bias}, newShape(a, b, transA, transB)), - transA_(transA), - transB_(transB), - scalar_(scalar) { - ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu, - "AffineWithReluNodeOp currently only supported for inference on GPU"); + Expr bias) + : NaryNodeOp({a, b, bias}, newShape(a, b, false, false)), + transA_(false), + transB_(false), + scalar_(1.0) { + ABORT_IF(!graph()->isInference(), + "AffineWithReluNodeOp currently only supported for inference"); } Shape newShape(Expr a, Expr b, bool transA, bool transB) { @@ -464,8 +461,8 @@ class AffineWithReluNodeOp : public NaryNodeOp { } NodeOps forwardOps() override { - ABORT_IF(!graph()->isInference() || graph()->getDeviceId().type != DeviceType::gpu, - "AffineWithReluNodeOp currently only supported for inference on GPU"); + ABORT_IF(!graph()->isInference(), + "AffineWithReluNodeOp currently only supported for inference"); return { NodeOp(Affine(val_, diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 448b4c4a4..27121fa6d 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -858,6 +858,8 @@ class ReshapeNodeOp : public UnaryNodeOp { } }; + + // @TODO: add version with access to backward step // This allows to attach a lambda function to any node during the execution. It is a non-operation otherwise // i.e. doesn't consume any memory or take any time to execute (it's a reshape onto itself) other than the @@ -897,6 +899,45 @@ class CallbackNodeOp : public ReshapeNodeOp { } }; +class DropoutReluInplaceNodeOp : public ReshapeNodeOp { +private: + Expr mask_; + +public: + DropoutReluInplaceNodeOp(Expr node, Expr mask) + : ReshapeNodeOp(node, node->shape()), + mask_(mask) {} + + void forward() override { + using namespace marian::functional; + Element(_1 = ReLU(_1 * _2), val(), mask_->val()); + } + + void backward() override { + using namespace marian::functional; + Element(_1 = _1 * ReLUback(_2) * _3, grad(), val(), mask_->val()); + } + + const std::string type() override { return "dropoutReluInplace"; } + + virtual size_t hash() override { + size_t seed = ReshapeNodeOp::hash(); + util::hash_combine(seed, mask_->hash()); + return seed; + } + + virtual bool equal(Expr node) override { + if(!ReshapeNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(mask_ != cnode->mask_) + return false; + return true; + } +}; + // @TODO: review if still required as this is an ugly hack anyway. // Memory less operator that clips gradients during backward step // Executes this as an additional operation on the gradient. 
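A note on the backward pass of DropoutReluInplaceNodeOp above: since the node derives from ReshapeNodeOp, val() and grad() alias the input node's buffers, so the gradient can be rescaled in place. For z = relu(y*m) the derivative with respect to y is m * 1[z > 0], which is what the _1 * ReLUback(_2) * _3 expression in the node computes. A plain C++ sketch of that update (illustrative, not the Marian kernel):

  #include <cstddef>

  // grad holds dL/dz on entry and dL/dy on exit; out is the saved forward output z; mask is m.
  void fusedReluDropoutBackwardInplace(float* grad, const float* out, const float* mask, std::size_t n) {
    for(std::size_t i = 0; i < n; ++i)
      grad[i] *= (out[i] > 0.f ? 1.f : 0.f) * mask[i];
  }
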
diff --git a/src/layers/generic.h b/src/layers/generic.h index b423befeb..df11a2337 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -234,12 +234,16 @@ static inline Expr denseInline(Expr x, auto b = graph->param(prefix + "_b" + suffix, {1, outDim}, inits::zeros()); if(actName == "relu") { - x = affineWithRelu(x, W, b); // speed optimization for inference, @TODO: handle better in future layer framework + x = affineWithReluDropout(x, W, b, dropProb); // fused operator for transformer FFN } else { x = affine(x, W, b); x = activationByName(actName)(x); + + int dimModel = x->shape()[-1]; + int dimTime = x->shape()[-2]; + x = dropout(x, dropProb, {dimTime, dimModel}); } - x = dropout(x, dropProb); // @TODO: check for infernce? + return x; } diff --git a/src/models/encoder_classifier.h b/src/models/encoder_classifier.h index 265bdacbd..552e428f2 100644 --- a/src/models/encoder_classifier.h +++ b/src/models/encoder_classifier.h @@ -116,6 +116,7 @@ class EncoderClassifier : public EncoderClassifierBase { modelFeatures_.insert("transformer-heads"); modelFeatures_.insert("transformer-no-projection"); + modelFeatures_.insert("transformer-rnn-projection"); modelFeatures_.insert("transformer-dim-ffn"); modelFeatures_.insert("transformer-ffn-depth"); modelFeatures_.insert("transformer-ffn-activation"); diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index a6f4dd3dc..6a298ed0d 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -37,6 +37,7 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-heads"); modelFeatures_.insert("transformer-no-projection"); + modelFeatures_.insert("transformer-rnn-projection"); modelFeatures_.insert("transformer-dim-ffn"); modelFeatures_.insert("transformer-decoder-dim-ffn"); modelFeatures_.insert("transformer-ffn-depth"); diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index 7bd17c41a..124d873c5 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -130,6 +130,7 @@ class EncoderPooler : public EncoderPoolerBase { modelFeatures_.insert("transformer-heads"); modelFeatures_.insert("transformer-no-projection"); + modelFeatures_.insert("transformer-rnn-projection"); modelFeatures_.insert("transformer-dim-ffn"); modelFeatures_.insert("transformer-ffn-depth"); modelFeatures_.insert("transformer-ffn-activation"); diff --git a/src/models/transformer.h b/src/models/transformer.h index d87594e0e..243d2c7fc 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -170,8 +170,11 @@ class Transformer : public EncoderOrDecoderBase { auto output = input; for(auto op : ops) { // dropout - if (op == 'd') - output = dropout(output, dropProb); + if (op == 'd') { + int dimModel = output->shape()[-1]; + int dimTime = output->shape()[-2]; + output = dropout(output, dropProb, {dimTime, dimModel}); + } // layer normalization else if (op == 'n') output = layerNorm(output, prefix, "_pre"); @@ -435,7 +438,7 @@ class Transformer : public EncoderOrDecoderBase { // the stack of FF layers for(int i = 1; i < depthFfn; ++i) - output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actName, ffnDropProb); + output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, initFn, actName, ffnDropProb); output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel, initFn); auto opsPost = opt("transformer-postprocess"); @@ -538,6 +541,13 @@ class Transformer : public 
EncoderOrDecoderBase { decoderState = rnn->lastCellStates()[0]; output = transposeTimeBatch(output); + if(opt("transformer-rnn-projection", false)) { + int dimModel = output->shape()[-1]; + auto Wo = graph_->param(prefix + "_Wo", {dimModel, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); + auto bo = graph_->param(prefix + "_bo", {1, dimModel}, inits::zeros()); + output = affine(output, Wo, bo); // [-4: beam depth, -3: batch size, -2: 1, -1: vector dim] + } + auto opsPost = opt("transformer-postprocess"); output = postProcess(prefix + "_ffn", opsPost, output, input, dropProb); diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index ade8b4892..edec0e1a7 100755 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -70,6 +70,9 @@ template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor > >, marian::functional::Capture> > > >>(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor > >, marian::functional::Capture> > > >, IntrusivePtr); template void marian::gpu::Element, marian::functional::UnaryFunctor > >, marian::Tensor >(marian::functional::Assign, marian::functional::UnaryFunctor > >, marian::Tensor, marian::Tensor); template void marian::gpu::Element, marian::functional::UnaryFunctor > >, marian::Tensor >(marian::functional::Assign, marian::functional::UnaryFunctor > >, marian::Tensor, marian::Tensor); +template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr, IntrusivePtr); // How to add new specializations: // When you use a new specialization, it will cause a link error of this form (example): // .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element ( ... )' diff --git a/src/tests/dropout.cpp b/src/tests/dropout.cpp index 367029fe8..97a30b4f6 100644 --- a/src/tests/dropout.cpp +++ b/src/tests/dropout.cpp @@ -7,7 +7,7 @@ using namespace marian; int main(int argc, char** argv) { - auto c = New(argc, argv); + auto c = parseOptions(argc, argv, cli::mode::scoring, false); auto type = c->get("cpu-threads") > 0 ? 
DeviceType::cpu @@ -20,11 +20,7 @@ int main(int argc, char** argv) { for(int i = 0; i < 10; ++i) { g->clear(); - auto mask1 = g->dropoutMask(0.2, {10, 3072}); - auto mask2 = g->dropoutMask(0.3, {1, 3072}); - auto mask = mask1 + mask2; - debug(mask1, "mask1"); - debug(mask2, "mask2"); + auto mask = g->dropoutMask(0.2, {1000, 16384}); debug(mask, "mask"); g->forward(); } diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index f3b5fda34..236823fe4 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -595,7 +595,7 @@ void tests(DeviceType device, Type floatType = Type::float32) { auto aff1 = affine(A, B, bias); auto aff2 = dot(A, B) + bias; - auto affRelu1 = affineWithRelu(A, B, bias); + auto affRelu1 = affineWithReluDropout(A, B, bias); auto affRelu2 = relu(dot(A, B) + bias); graph->forward(); From 2cd3055d762866b25b892c3c4e164ebb81993c3c Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 27 Sep 2022 18:40:53 +0000 Subject: [PATCH 200/254] Merged PR 25836: Check via hashing if re-syncing in local mode is required * This adds GPU-side hashing to tensors (a hash based on mumurhash3) * The hash is used to check if parameters across nodes have diverged, if yes, resync all parameters and optimizer shards. Before it would resync every N (100 or 200) updates. Now this can be skipped if nothing diverged. --- CHANGELOG.md | 2 ++ VERSION | 2 +- src/CMakeLists.txt | 1 + src/common/hash.h | 3 +- src/functional/operators.h | 45 ++++++++++++++++++++++++ src/tensors/gpu/add_all.inc | 6 ++-- src/tensors/gpu/hash.cu | 57 +++++++++++++++++++++++++++++++ src/tensors/tensor.cpp | 14 ++++---- src/tensors/tensor.h | 11 +++++- src/tensors/tensor_operators.h | 7 ++++ src/training/graph_group.cpp | 36 +++++++++++++++++++ src/training/graph_group.h | 1 + src/training/graph_group_sync.cpp | 6 +--- 13 files changed, 173 insertions(+), 18 deletions(-) create mode 100644 src/tensors/gpu/hash.cu diff --git a/CHANGELOG.md b/CHANGELOG.md index 23008b734..44145b897 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Fused inplace-dropout in FFN layer in Transformer - `--force-decode` option for marian-decoder - `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`) @@ -24,6 +25,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed fp16 training/inference with factors-combine concat method ### Changed +- Parameter synchronization in local sharding model now executes hash checksum before syncing - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce - Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes. - Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling scale by 8 and do not try to automatically scale up or down. This seems most stable. 
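The divergence check described in the PR 25836 message above reduces each node's parameters to a single 32-bit value with a murmur3-style mix and only triggers a full re-broadcast of parameters and optimizer shards when the values disagree across ranks. For reference, a CPU-side sketch of the mixing and finalization steps; the magic constants match those in functional/operators.h and tensors/gpu/hash.cu in the diffs below, but the GPU version bit-casts floats to uint32 and applies the mix as a tree reduction, so its result differs from a strictly sequential MurmurHash3:

  #include <cstdint>

  // Mix one 32-bit key into the running hash state (murmur3-style combine).
  uint32_t murmurCombine(uint32_t h, uint32_t k) {
    k *= 0xcc9e2d51; k = (k << 15) | (k >> 17); k *= 0x1b873593;
    h ^= k;
    h = (h << 13) | (h >> 19);
    return h * 5 + 0xe6546b64;
  }

  // Final avalanche over the accumulated state; len is the number of hashed elements.
  uint32_t murmurFinalize(uint32_t h, uint32_t len) {
    h ^= len;
    h ^= h >> 16; h *= 0x85ebca6b;
    h ^= h >> 13; h *= 0xc2b2ae35;
    return h ^ (h >> 16);
  }
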
diff --git a/VERSION b/VERSION index 316ba050f..d15b7998b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.9 +v1.11.11 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4599c407..f095f2eb8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -177,6 +177,7 @@ set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY if(CUDA_FOUND) cuda_add_library(marian_cuda tensors/gpu/device.cu + tensors/gpu/hash.cu tensors/gpu/algorithm.cu tensors/gpu/prod.cpp tensors/gpu/prod.cu diff --git a/src/common/hash.h b/src/common/hash.h index 7aca30de2..c2df2a63e 100644 --- a/src/common/hash.h +++ b/src/common/hash.h @@ -18,8 +18,7 @@ inline void hash_combine(HashType& seed, T const& v) { // Hash a whole chunk of memory, mostly used for diagnostics template -inline HashType hashMem(const T* beg, size_t len) { - HashType seed = 0; +inline HashType hashMem(const T* beg, size_t len, HashType seed = 0) { for(auto it = beg; it < beg + len; ++it) hash_combine(seed, *it); return seed; diff --git a/src/functional/operators.h b/src/functional/operators.h index a14f153f1..80b40ff40 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -750,5 +750,50 @@ UNARY(sReLUBack, ReLUback, Ops::reluBack(x)); BINARY(sPReLU, PReLU, Ops::prelu(x, y)); BINARY(sPReLUBack, PReLUback, Ops::preluBack(x, y)); +#ifdef __CUDACC__ +// only visible by nvcc + +DEVICE_INLINE uint32_t gf2u(float f32) { + // binary cast, bits stay the same + return __float_as_uint(f32); +} + +DEVICE_INLINE float gu2f(uint32_t u32) { + // binary cast, bits stay the same + return __uint_as_float(u32); +} + +// this is an adaptation of murmurhash3 as binary operator, all the +// magic numbers are present in the cpu implementation +DEVICE_INLINE uint32_t murmur3_u32(uint32_t seed, uint32_t key) { + uint32_t h = seed; + uint32_t k = key; + + k *= 0xcc9e2d51; + k = (k << 15) | (k >> 17); + k *= 0x1b873593; + + h ^= k; + + h = (h << 13) | (h >> 19); + h = h * 5 + 0xe6546b64; + + return h; +} + +DEVICE_INLINE float murmur3_f32(float seed, float key) { + // We cast from float to uint32_t and the hash back to float. + // Not great, but allows us to hack the float-specific reduction function to accumulate a hash value. + // This is not exactly murmurhash3 since we do a tree-reduction of hashes while murmur hash combines + // values linearly in memory order. But when tested this seems to work just as well for hashing purposes. + return gu2f(murmur3_u32(gf2u(seed), gf2u(key))); +} + +// Define a binary operator that allows for hashing inside the Marian low-level operator framework. +// For now, gpu-side only. 
+BINARY(Murmur, murmur, murmur3_f32(x, y)); + +#endif + } // end namespace functional } // end namespace marian diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index b6cb34173..ba466d895 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -36,10 +36,11 @@ template void AggregateAll, BinaryFunctor, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); template void marian::AggregateAll >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); -template void marian::AggregateAll,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); +template void marian::AggregateAll,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); template void marian::AggregateAll >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::UnaryFunctor >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::AggregateAll, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); #if COMPILE_FP16 template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -77,8 +78,9 @@ template void AggregateAll<__half, float, Assignee<1>, 
BinaryFunctor, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); -template void marian::AggregateAll<__half,float,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); +template void marian::AggregateAll<__half,float,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,marian::functional::BinaryFunctor,marian::functional::Assignee<2> > >(std::shared_ptr,marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,marian::functional::BinaryFunctor,marian::functional::Assignee<2> >,float,IntrusivePtr,IntrusivePtr,IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::UnaryFunctor >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::AggregateAll<__half, float, marian::functional::Assignee<1>, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); #endif diff --git a/src/tensors/gpu/hash.cu b/src/tensors/gpu/hash.cu new file mode 100644 index 000000000..e132cdc4f --- /dev/null +++ b/src/tensors/gpu/hash.cu @@ -0,0 +1,57 @@ +#include "tensors/gpu/add_all.h" +#include "functional/operators.h" +// clang-format on + +#include 
+
+#if COMPILE_FP16
+#include
+#endif
+
+namespace marian {
+namespace gpu {
+
+// cpu-side conversion of float to uint32_t via bit-wise cast
+uint32_t f2u(float f32) {
+  uint32_t u32;
+  std::memcpy(&u32, &f32, 4);
+  return u32;
+}
+
+// cpu-side conversion of uint32_t to float via bit-wise cast
+float u2f(uint32_t u32) {
+  float f32;
+  std::memcpy(&f32, &u32, 4);
+  return f32;
+}
+
+// Computes a murmur3-ish hash value for a Marian tensor.
+uint32_t hashTensor(Tensor tensor, uint32_t seed, Ptr allocator) {
+  // we first accumulate into a single value via a binary murmurhash3-like operator,
+  // see functional/operators.h for details.
+  using namespace functional;
+  uint32_t h = 0;
+  if(tensor->type() == Type::float32)
+    h = f2u(AggregateAllAndReturn(allocator, _1, u2f(seed), murmur(_1, _2), 1, tensor));
+#if COMPILE_FP16
+  else if(tensor->type() == Type::float16)
+    // internally, a half value gets cast to a float value before hashing or combining. This is the same
+    // mechanism as for summing, where we cast to a larger type for better precision.
+    h = f2u(AggregateAllAndReturn(allocator, _1, u2f(seed), murmur(_1, _2), 1, tensor));
+#endif
+  else
+    ABORT("Hashing of tensors not supported for type {}", tensor->type());
+
+  // finalization according to murmurhash3 implementation
+  uint32_t len = (uint32_t)tensor->size();
+  h ^= len;
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+  return h;
+}
+
+} // namespace gpu
+} // namespace marian
\ No newline at end of file
diff --git a/src/tensors/tensor.cpp b/src/tensors/tensor.cpp
index 02de17bc5..e9a07ab46 100644
--- a/src/tensors/tensor.cpp
+++ b/src/tensors/tensor.cpp
@@ -138,13 +138,13 @@ void TensorBase::set(const io::Item& item) {
                memory_->data());
 }
 
-size_t TensorBase::hash() {
-  io::Item temp;
-  size_t seed = 0;
-  get(temp, "temp");
-  for(auto c : temp.bytes)
-    util::hash_combine(seed, c);
-  return seed;
+size_t TensorBase::hash(size_t seed, Ptr allocator) {
+#ifdef CUDA_FOUND
+  if(backend_->getDeviceId().type == DeviceType::gpu)
+    return marian::gpu::hashTensor(this, (uint32_t)seed, allocator);
+  else // we assume CPU
+#endif
+    return marian::util::hashMem(memory_->data(), memory_->size(), seed);
 }
 
 }  // namespace marian
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index a70714043..48e3aaec9 100644
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -3,8 +3,10 @@
 #include "common/definitions.h"
 #include "common/shape.h"
 #include "common/types.h"
+#include "tensors/allocator.h"
 #include "tensors/backend.h"
 #include "tensors/memory_piece.h"
+
 #ifdef CUDA_FOUND
 #include "tensors/gpu/algorithm.h"
 #endif
@@ -327,7 +329,14 @@ class TensorBase {
     DISPATCH_BY_TYPE2(type_, debug, precision, dispCols);
   }
 
-  size_t hash();
+  // Computes a hash value for the given tensor; for a cpu-side tensor this is
+  // going to be the hash function from stdlib (64-bit), for gpu-side tensors
+  // it is going to be the result of a murmurhash3-like hash (32-bit).
+  // The argument seed can be used to define a new random hash function.
+  // The allocator argument can be used to allocate memory via the standard
+  // marian allocator instead of cudaMalloc (the default).
+  // The hashes are not the same for cpu and gpu!
+  size_t hash(size_t seed = 0, Ptr allocator = nullptr);
 };
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
index 1fc4542d8..178bb6920 100644
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@@ -436,5 +436,12 @@ static inline float L2Norm(marian::Tensor in, Ptr allocator) {
 // clang-format off
 DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
 DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
+
+#ifdef CUDA_FOUND
+namespace gpu {
+  uint32_t hashTensor(Tensor tensor, uint32_t seed, Ptr allocator);
+}
+#endif
+
 // clang-format on
 }  // namespace marian
diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp
index d9a77a708..0ba1b279d 100644
--- a/src/training/graph_group.cpp
+++ b/src/training/graph_group.cpp
@@ -91,6 +91,42 @@ void GraphGroup::initGraphsAndOpts() {
   }
 }
 
+void GraphGroup::syncParametersAndShards() {
+  // In local sharding mode we have seen that parameters can diverge occasionally due to non-determinism in NCCL.
+  // Here, we try to catch this and, if caught, re-sync everything (also optimizer state) across nodes.
+  if(shardingMode_ == ShardingMode::local) {
+    std::vector hashes(mpi_->numMPIProcesses(), 0);
+    // compute hash value of parameters of 0-th graph (we only need to check one graph per node)
+    for(int i = 0; i < hashes.size(); i++) {
+      if(i == mpi_->myMPIRank()) {
+        hashes[i] = graphs_[0]->params()->vals()->hash(); // this is quite fast with on-GPU implementation
+        LOG(debug, "Parameter hash for graph 0 on node {}: {}", mpi_->myMPIRank(), hashes[i]);
+      }
+    }
+
+    // Collect hashes from all nodes, note the changing rootRank.
+    // After this, hashes contains the hash from every node.
+    for(int i = 0; i < hashes.size(); i++)
+      mpi_->bCast(&hashes[i], 1, mpi_->getDataType(&hashes[i]), /*rootRank=*/i);
+
+    // If any of the hashes diverges, re-sync.
+    if(std::any_of(hashes.begin(), hashes.end(), [&hashes](size_t v){ return v != hashes[0]; })) {
+      if(isMainProcess()) {
+        LOG(warn, "Parameters diverged:");
+        for(int i = 0; i < hashes.size(); i++)
+          LOG(warn, "\tGot hash {} for node {}", hashes[i], i);
+        LOG(warn, "Syncing all parameters and optimizer shards across {} MPI processes", mpi_->numMPIProcesses());
+      }
+
+      comm_->broadcastParams();
+      comm_->broadcastShards(optimizerShards_);
+
+      if(isMainProcess())
+        LOG(warn, "Re-synced all shards");
+    }
+  }
+}
+
 // increase cost-scaling factor if no NaN has been detected for a
 // given number of iterations. Usually we increase by 2 which adds
 // one more bit for precision.
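For reference, the hashing scheme that hash.cu introduces above boils down to the following CPU-side sketch: tensor elements are bit-cast from float to uint32_t, folded into a single 32-bit value with a murmur-style combine (seeded via u2f(seed)), and the accumulated value is then pushed through the standard murmurhash3 finalizer together with the element count. The combine function below is only a stand-in for the murmur(_1, _2) functor from functional/operators.h, which this patch does not show, so exact hash values may differ from the GPU implementation; the bit-casting and finalization steps mirror the code above.

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Bit-cast a float to uint32_t, as f2u() does in hash.cu.
    static uint32_t bitCast(float f) {
      uint32_t u;
      std::memcpy(&u, &f, sizeof(u));
      return u;
    }

    // Murmur3-style combine of a running hash with one 32-bit value.
    // Stand-in for the murmur(_1, _2) functor in functional/operators.h; the
    // mixing constants are the usual murmurhash3 ones, but the real operator
    // may differ in detail.
    static uint32_t murmurCombine(uint32_t h, uint32_t k) {
      k *= 0xcc9e2d51u; k = (k << 15) | (k >> 17); k *= 0x1b873593u;
      h ^= k;
      h = (h << 13) | (h >> 19);
      return h * 5u + 0xe6546b64u;
    }

    // CPU reference for the GPU-side hashTensor(): accumulate, then finalize.
    uint32_t hashFloats(const std::vector<float>& values, uint32_t seed = 0) {
      uint32_t h = seed;
      for(float v : values)
        h = murmurCombine(h, bitCast(v));      // accumulation step
      uint32_t len = (uint32_t)values.size();  // finalization as in hash.cu
      h ^= len;
      h ^= h >> 16; h *= 0x85ebca6bu;
      h ^= h >> 13; h *= 0xc2b2ae35u;
      h ^= h >> 16;
      return h;
    }

Any two replicas holding bit-identical parameters map to the same value under such a hash, which is all the divergence check in GraphGroup::syncParametersAndShards() above needs; a single 32-bit value per node is then cheap to exchange over MPI.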
diff --git a/src/training/graph_group.h b/src/training/graph_group.h
index 9f1362e75..0895caa77 100644
--- a/src/training/graph_group.h
+++ b/src/training/graph_group.h
@@ -85,6 +85,7 @@ class GraphGroup {
   GraphGroup(Ptr options);
 
   void initGraphsAndOpts();
+  void syncParametersAndShards();
 
   virtual ~GraphGroup() {}
diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp
index c90a384e4..a3eee8a7b 100644
--- a/src/training/graph_group_sync.cpp
+++ b/src/training/graph_group_sync.cpp
@@ -346,11 +346,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num
   scheduler_->update(localLoss, numReadBatches, updateBatchSize, updateTargetWords, gradNorm);
 
   if(scheduler_->syncing()) {
-    if(shardingMode_ == ShardingMode::local) {
-      LOG(debug, "Syncing all parameters and optimizer shards across {} MPI processes", mpi_->numMPIProcesses());
-      comm_->broadcastParams();
-      comm_->broadcastShards(optimizerShards_);
-    }
+    syncParametersAndShards();
   }
 
   // save intermediate model (and optimizer state) to file

From 2c55cdb3c0d37e6522f79dbff56e8ee24bb4c61e Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Thu, 29 Sep 2022 19:01:49 +0000
Subject: [PATCH 201/254] Merged PR 25889: Fixes bad memory access problem in hashing

Fix bad memory access problem in hashing by using the graph allocator
---
 CHANGELOG.md                 | 1 +
 VERSION                      | 2 +-
 src/training/graph_group.cpp | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44145b897..b06abec78 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`)
 
 ### Fixed
+- Use allocator in hashing
 - Read/restore checkpoints from main process only when training with MPI
 - Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast)
 - Throw `ShapeSizeException` if total expanded shape size exceeds numeric capacity of the maximum int value (2^31-1)
diff --git a/VERSION b/VERSION
index d15b7998b..fb1b4bd9d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.11.11
+v1.11.12
diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp
index 0ba1b279d..cb95470f4 100644
--- a/src/training/graph_group.cpp
+++ b/src/training/graph_group.cpp
@@ -99,7 +99,8 @@ void GraphGroup::syncParametersAndShards() {
     // compute hash value of parameters of 0-th graph (we only need to check one graph per node)
     for(int i = 0; i < hashes.size(); i++) {
       if(i == mpi_->myMPIRank()) {
-        hashes[i] = graphs_[0]->params()->vals()->hash(); // this is quite fast with on-GPU implementation
+        auto allocator = graphs_[0]->allocator();
+        hashes[i] = graphs_[0]->params()->vals()->hash(1234, allocator); // this is quite fast with on-GPU implementation
         LOG(debug, "Parameter hash for graph 0 on node {}: {}", mpi_->myMPIRank(), hashes[i]);
       }
     }

From 1e92cff93d93d3bdd229022012e281a1ef3fc494 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Tue, 4 Oct 2022 00:42:52 +0000
Subject: [PATCH 202/254] Merged PR 25919: Sync with public master - no review required

Sync with public master, checking compilation, regression tests etc.
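Before the CI and documentation sync below, a brief note on the hashing API touched by PR 25889 above: the fix routes the temporary memory needed by the hashing reduction through the graph's allocator instead of the default allocation path, and the call site in graph_group.cpp shows the intended usage. The following is a minimal sketch of that call pattern, written as a fragment in the context of GraphGroup; the surrounding graph setup is assumed and not part of these patches.

    // Hash the flat parameter tensor of the first graph on this node.
    // Passing the graph allocator keeps the reduction's temporary buffer
    // inside the graph's workspace, which is what PR 25889 fixes.
    auto graph     = graphs_[0];          // assumed: an initialized graph of this GraphGroup
    auto allocator = graph->allocator();  // workspace allocator of that graph
    size_t h = graph->params()->vals()->hash(/*seed=*/1234, allocator);
    LOG(debug, "Parameter hash for graph 0: {}", h);

    // Per the comment added to tensor.h above, CPU tensors hash with a 64-bit
    // stdlib-based hash and GPU tensors with the 32-bit murmur-style hash, so
    // hashes are only comparable between processes using the same device type.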
--- .github/workflows/macos.yml | 8 +- .github/workflows/ubuntu.yml | 58 +- CHANGELOG.md | 7 +- README.md | 8 +- VERSION | 2 +- doc/.gitignore | 1 + doc/Makefile | 3 +- doc/conf.py | 13 - doc/index.rst | 22 +- doc/requirements.txt | 4 +- examples | 2 +- regression-tests | 2 +- src/3rd_party/catch.hpp | 2013 +++++++++++++++---------- src/3rd_party/intgemm | 2 +- src/3rd_party/simple-websocket-server | 2 +- src/common/config_validator.cpp | 2 + src/data/factored_vocab.cpp | 2 +- src/data/shortlist.h | 3 - 18 files changed, 1332 insertions(+), 822 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 20907d9b6..f06eed256 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -9,7 +9,7 @@ on: jobs: build-macos: name: MacOS CPU-only - runs-on: macos-10.15 + runs-on: macos-12 steps: - name: Checkout @@ -18,10 +18,12 @@ jobs: submodules: recursive - name: Install dependencies - run: brew install boost openssl protobuf + run: brew install boost openblas openssl protobuf - name: Configure CMake run: | + export LDFLAGS="-L/usr/local/opt/openblas/lib" + export CPPFLAGS="-I/usr/local/opt/openblas/include" mkdir -p build cd build cmake .. \ @@ -48,4 +50,4 @@ jobs: ./marian-decoder --version ./marian-scorer --version ./spm_encode --version - + ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \)) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 4a0fa6746..bc01b74a8 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -13,42 +13,58 @@ jobs: include: # Ubuntu CPU-only build - name: "Ubuntu CPU-only" - os: ubuntu-18.04 + os: ubuntu-20.04 cuda: "" - gcc: 7 + gcc: 9 + clang: "" + cpu: true + gpu: false + unit_tests: true + examples: false + # Using Clang compiler + - name: "Ubuntu CPU-only clang-14" + os: ubuntu-22.04 + cuda: "" + gcc: "" + clang: 14 cpu: true gpu: false unit_tests: true examples: false # Ubuntu GPU-only build - name: "Ubuntu GPU-only" - os: ubuntu-18.04 - cuda: "10.2" - gcc: 7 + os: ubuntu-20.04 + cuda: "11.1" + gcc: 9 + clang: "" cpu: false gpu: true unit_tests: false examples: true - # Ubuntu 20.04 supports CUDA 11+ + # Ubuntu 22.04 supports CUDA 11.7 # Unit tests and examples are not compiled to save disk space - - name: "Ubuntu 20.04 CUDA 11.2 gcc-9" - os: ubuntu-20.04 - cuda: "11.2" - gcc: 9 + - name: "Ubuntu 22.04 CUDA 11.7 gcc-11" + os: ubuntu-22.04 + cuda: "11.7" + gcc: 11 + clang: "" cpu: false gpu: true unit_tests: false examples: false - # Ubuntu 18.04 supports CUDA 10.1+ + # Ubuntu 20.04 supports CUDA 11+ # Unit tests and examples are not compiled to save disk space - - name: "Ubuntu 18.04 CUDA 10.2 gcc-8" - os: ubuntu-18.04 - cuda: "10.2" - gcc: 8 + - name: "Ubuntu 20.04 CUDA 11.1 gcc-9" + os: ubuntu-20.04 + cuda: "11.1" + gcc: 9 + clang: "" cpu: true gpu: true unit_tests: false examples: false + # Ubuntu 18.04 supports CUDA 10.1+ + # But it will soon be removed from GitHub workflows # Ubuntu 16.04 supports CUDA 8+ # But it is no longer available in GitHub workflows @@ -64,10 +80,13 @@ jobs: # The following packages are already installed on GitHub-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev # Boost is no longer pre-installed on GitHub-hosted runners + # Clang 12, 13 and 14 are pre-installed on the ubuntu-22.04 image + # Note that installation of libunwind-dev is a bug fix for ubuntu-22.04 images on 
Azure/GitHub-hosted machines + # and is normally not required - name: Install dependencies run: | - sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev \ - gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} + sudo apt-get install -y libunwind-dev libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-system-dev + [ -z "${{ matrix.gcc }}" ] || sudo apt-get install -y gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL @@ -86,9 +105,10 @@ jobs: # https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671 - name: Configure CMake run: | + [ -z "${{ matrix.gcc }}" ] || export CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} + [ -z "${{ matrix.clang }}" ] || export CC=/usr/bin/clang-${{ matrix.clang }} CXX=/usr/bin/clang++-${{ matrix.clang }} mkdir -p build cd build - CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \ cmake .. \ -DBoost_ARCHITECTURE=-x64 \ -DCMAKE_BUILD_TYPE=Release \ @@ -122,4 +142,4 @@ jobs: ./marian-scorer --version ./marian-server --version ./spm_encode --version - + ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \)) diff --git a/CHANGELOG.md b/CHANGELOG.md index b06abec78..f93148e87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,23 +18,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Read/restore checkpoints from main process only when training with MPI - Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast) - Throw `ShapeSizeException` if total expanded shape size exceeds numeric capacity of the maximum int value (2^31-1) -- During mini-batch-fitting, catch `ShapeSizeException` and use another sizing hint. Aborts outside mini-batch-fitting. +- During mini-batch-fitting, catch `ShapeSizeException` and use another sizing hint. Aborts outside mini-batch-fitting. - Fix incorrect/missing gradient accumulation with delay > 1 or large effective batch size of biases of affine operations. - Fixed case augmentation with multi-threaded reading. - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load - Fixed check for `fortran_ordering` in cnpy - Fixed fp16 training/inference with factors-combine concat method +- Fixed clang 13.0.1 compatibility +- Fixed potential vulnerabilities from lxml<4.9.1 or mistune<2.0.31 ### Changed - Parameter synchronization in local sharding model now executes hash checksum before syncing - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce -- Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes. +- Negative `--workspace -N` value allocates workspace as total available GPU memory minus N megabytes. - Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling scale by 8 and do not try to automatically scale up or down. This seems most stable. - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce. - Changed minimal C++ standard to C++-17 - Faster LSH top-k search on CPU - Updated intgemm to the latest upstream version - Parameters in npz files are no longer implicitly assumed to be row-ordered. 
Non row-ordered parameters will result in an abort +- Updated Catch2 header from 2.10.1 to 2.13.9 ## [1.11.0] - 2022-02-08 diff --git a/README.md b/README.md index 7fa003e19..a8d84c2af 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ Marian ====== - -[![Build Status CUDA 10](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.2.svg?label=CUDA%2010.2)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.2/) -[![Build Status CUDA 11](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-11.4.svg?label=CUDA%2011.4)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-11.4/) -[![Build Status CPU](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/) -[![Tests Status](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-regression-tests.svg?label=tests)](http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/) +[![Ubuntu](https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml/badge.svg)](https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml) +[![Windows](https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml/badge.svg)](https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml) +[![MacOS](https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml/badge.svg)](https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml) [![Latest release](https://img.shields.io/github/release/marian-nmt/marian.svg?label=release)](https://github.com/marian-nmt/marian/releases) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE.md) [![Twitter](https://img.shields.io/twitter/follow/marian_nmt.svg?style=social)](https://twitter.com/intent/follow?screen_name=marian_nmt) diff --git a/VERSION b/VERSION index fb1b4bd9d..9ec465949 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.12 +v1.11.13 diff --git a/doc/.gitignore b/doc/.gitignore index 4d192b770..47b72ab07 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -2,3 +2,4 @@ api build doxygen venv +CONTRIBUTING.md diff --git a/doc/Makefile b/doc/Makefile index 84310d9dc..aa2048b8c 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -14,10 +14,11 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile + cp $(SOURCEDIR)/../CONTRIBUTING.md $(SOURCEDIR)/ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) # Clean target as recommended by Exhale # https://exhale.readthedocs.io/en/latest/usage.html#optional-create-a-proper-clean-target clean: - rm -rf doxygen/ api/ + rm -rf doxygen/ api/ $(SOURCEDIR)/CONTRIBUTING.md @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/conf.py b/doc/conf.py index b0c68bcdf..192dd27dd 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -114,16 +114,3 @@ primary_domain = 'cpp' highlight_language = 'cpp' - -# A trick to include markdown files from outside the source directory using -# 'mdinclude'. 
Warning: all other markdown files not included via 'mdinclude' -# will be rendered using recommonmark as recommended by Sphinx -from m2r import MdInclude - -def setup(app): - # from m2r to make `mdinclude` work - app.add_config_value('no_underscore_emphasis', False, 'env') - app.add_config_value('m2r_parse_relative_links', False, 'env') - app.add_config_value('m2r_anonymous_references', False, 'env') - app.add_config_value('m2r_disable_inline_math', False, 'env') - app.add_directive('mdinclude', MdInclude) diff --git a/doc/index.rst b/doc/index.rst index d19bb4b00..9d769c32d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,7 +1,7 @@ Welcome to Marian's documentation! ================================== -|buildgpu| |buildcpu| |tests| |release| |license| +|ubuntu| |windows| |macos| |release| |license| Marian is an efficient and self-contained Neural Machine Translation framework with an integrated automatic differentiation engine based on dynamic computation graphs, written entirely in C++. @@ -19,7 +19,7 @@ This is developer documentation. User documentation is available at https://mari factors api/library_index - contributing + CONTRIBUTING doc_guide @@ -30,17 +30,17 @@ Indices and tables * :ref:`genindex` -.. |buildgpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.2.svg?label=CUDAC%20Build - :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.2/ - :alt: GPU build status +.. |ubuntu| image:: https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml/badge.svg + :target: https://github.com/marian-nmt/marian-dev/actions/workflows/ubuntu.yml + :alt: Ubuntu build status -.. |buildcpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU%20Build - :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/ - :alt: CPU build status +.. |windows| image:: https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml/badge.svg + :target: https://github.com/marian-nmt/marian-dev/actions/workflows/windows.yml + :alt: Windows build status -.. |tests| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-regression-tests.svg?label=Tests - :target: http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/ - :alt: Tests status +.. |macos| image:: https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml/badge.svg + :target: https://github.com/marian-nmt/marian-dev/actions/workflows/macos.yml + :alt: MacOS build status .. 
|release| image:: https://img.shields.io/github/release/marian-nmt/marian.svg?label=Release :target: https://github.com/marian-nmt/marian/releases diff --git a/doc/requirements.txt b/doc/requirements.txt index 40de5ddd9..a2f87dd91 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,9 +1,9 @@ +lxml>=4.9.1 +docutils<=0.17 sphinx==2.4.4 breathe==4.13.0 exhale sphinx_rtd_theme myst-parser==0.14.0a3 -mistune<2.0.0 -m2r sphinx-mathjax-offline Jinja2<3.1 diff --git a/examples b/examples index 29f4f7c38..25e843832 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 29f4f7c380c860a95b9375813f4b199b2e6b5556 +Subproject commit 25e84383225a29f769e362250654ddf256d06261 diff --git a/regression-tests b/regression-tests index 4fa9ff55a..92e116efa 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 4fa9ff55af68bc87d8bd04c9b410f1e1d3874718 +Subproject commit 92e116efa369d6ed848c8eb19dfcef8bf7245d71 diff --git a/src/3rd_party/catch.hpp b/src/3rd_party/catch.hpp index 5d104bc46..07efa655e 100644 --- a/src/3rd_party/catch.hpp +++ b/src/3rd_party/catch.hpp @@ -1,9 +1,9 @@ /* - * Catch v2.10.1 - * Generated: 2019-10-20 20:52:21.372334 + * Catch v2.13.9 + * Generated: 2022-04-12 22:37:23.260201 * ---------------------------------------------------------- * This file has been merged from multiple headers. Please don't edit it directly - * Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved. + * Copyright (c) 2022 Two Blue Cubes Ltd. All rights reserved. * * Distributed under the Boost Software License, Version 1.0. (See accompanying * file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -14,8 +14,8 @@ #define CATCH_VERSION_MAJOR 2 -#define CATCH_VERSION_MINOR 10 -#define CATCH_VERSION_PATCH 1 +#define CATCH_VERSION_MINOR 13 +#define CATCH_VERSION_PATCH 9 #ifdef __clang__ # pragma clang system_header @@ -66,13 +66,16 @@ #if !defined(CATCH_CONFIG_IMPL_ONLY) // start catch_platform.h +// See e.g.: +// https://opensource.apple.com/source/CarbonHeaders/CarbonHeaders-18.1/TargetConditionals.h.auto.html #ifdef __APPLE__ -# include -# if TARGET_OS_OSX == 1 -# define CATCH_PLATFORM_MAC -# elif TARGET_OS_IPHONE == 1 -# define CATCH_PLATFORM_IPHONE -# endif +# include +# if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1) || \ + (defined(TARGET_OS_MAC) && TARGET_OS_MAC == 1) +# define CATCH_PLATFORM_MAC +# elif (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE == 1) +# define CATCH_PLATFORM_IPHONE +# endif #elif defined(linux) || defined(__linux) || defined(__linux__) # define CATCH_PLATFORM_LINUX @@ -132,42 +135,52 @@ namespace Catch { #endif -#if defined(CATCH_CPP17_OR_GREATER) -# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +// Only GCC compiler should be used in this block, so other compilers trying to +// mask themselves as GCC should be ignored. +#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && !defined(__CUDACC__) && !defined(__LCC__) +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic push" ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic pop" ) + +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
(void)__builtin_constant_p(__VA_ARGS__) + #endif -#ifdef __clang__ +#if defined(__clang__) + +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic push" ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic pop" ) + +// As of this writing, IBM XL's implementation of __builtin_constant_p has a bug +// which results in calls to destructors being emitted for each temporary, +// without a matching initialization. In practice, this can result in something +// like `std::string::~string` being called on an uninitialized value. +// +// For example, this code will likely segfault under IBM XL: +// ``` +// REQUIRE(std::string("12") + "34" == "1234") +// ``` +// +// Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented. +# if !defined(__ibmxl__) && !defined(__CUDACC__) +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, hicpp-vararg) */ +# endif + +# define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \ + _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"") + +# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) + +# define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" ) + +# define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" ) + +# define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + _Pragma( "clang diagnostic ignored \"-Wunused-template\"" ) -# define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \ - _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"") -# define CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - _Pragma( "clang diagnostic pop" ) - -# define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ - _Pragma( "clang diagnostic push" ) \ - _Pragma( "clang diagnostic ignored \"-Wunused-template\"" ) -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ - _Pragma( "clang diagnostic pop" ) #endif // __clang__ //////////////////////////////////////////////////////////////////////////////// @@ -225,11 +238,7 @@ namespace Catch { //////////////////////////////////////////////////////////////////////////////// // Visual C++ -#ifdef _MSC_VER - -# if _MSC_VER >= 1900 // Visual Studio 2015 or newer -# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS -# endif +#if defined(_MSC_VER) // Universal Windows platform does not support SEH // Or console colours (or console at all...) 
@@ -239,12 +248,20 @@ namespace Catch { # define CATCH_INTERNAL_CONFIG_WINDOWS_SEH # endif +# if !defined(__clang__) // Handle Clang masquerading for msvc + // MSVC traditional preprocessor needs some workaround for __VA_ARGS__ // _MSVC_TRADITIONAL == 0 means new conformant preprocessor // _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor -# if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL) -# define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR -# endif +# if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL) +# define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR +# endif // MSVC_TRADITIONAL + +// Only do this if we're not using clang on Windows, which uses `diagnostic push` & `diagnostic pop` +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma( warning(push) ) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION __pragma( warning(pop) ) +# endif // __clang__ + #endif // _MSC_VER #if defined(_REENTRANT) || defined(_MSC_VER) @@ -292,7 +309,7 @@ namespace Catch { #define CATCH_CONFIG_COLOUR_NONE #endif -#if defined(__UCLIBC__) +#if !defined(_GLIBCXX_USE_C99_MATH_TR1) #define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER #endif @@ -310,7 +327,10 @@ namespace Catch { // Check if byte is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) - # define CATCH_INTERNAL_CONFIG_CPP17_BYTE + # include + # if defined(__cpp_lib_byte) && (__cpp_lib_byte > 0) + # define CATCH_INTERNAL_CONFIG_CPP17_BYTE + # endif # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) // Check if variant is available and usable @@ -353,10 +373,6 @@ namespace Catch { # define CATCH_CONFIG_CPP17_OPTIONAL #endif -#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) -# define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS -#endif - #if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_CPP17_STRING_VIEW) # define CATCH_CONFIG_CPP17_STRING_VIEW #endif @@ -397,34 +413,41 @@ namespace Catch { # define CATCH_CONFIG_GLOBAL_NEXTAFTER #endif +// Even if we do not think the compiler has that warning, we still have +// to provide a macro that can be used by the code. +#if !defined(CATCH_INTERNAL_START_WARNINGS_SUPPRESSION) +# define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION +#endif +#if !defined(CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION) +# define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION +#endif #if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS +#endif + +// The goal of this macro is to avoid evaluation of the arguments, but +// still have the compiler warn on problems inside... +#if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN) +# define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
#endif #if defined(__APPLE__) && defined(__apple_build_version__) && (__clang_major__ < 10) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS -# undef CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS #elif defined(__clang__) && (__clang_major__ < 5) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS -# undef CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS -# define CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) @@ -530,9 +553,10 @@ namespace Catch { } // end namespace Catch #define CATCH_REGISTER_TAG_ALIAS( alias, spec ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME( AutoRegisterTagAlias )( alias, spec, CATCH_INTERNAL_LINEINFO ); } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION // end catch_tag_alias_autoregistrar.h // start catch_test_registry.h @@ -578,49 +602,24 @@ namespace Catch { /// A non-owning string class (similar to the forthcoming std::string_view) /// Note that, because a StringRef may be a substring of another string, - /// it may not be null terminated. c_str() must return a null terminated - /// string, however, and so the StringRef will internally take ownership - /// (taking a copy), if necessary. In theory this ownership is not externally - /// visible - but it does mean (substring) StringRefs should not be shared between - /// threads. + /// it may not be null terminated. class StringRef { public: using size_type = std::size_t; using const_iterator = const char*; private: - friend struct StringRefTestAccess; - - char const* m_start; - size_type m_size; - - char* m_data = nullptr; - - void takeOwnership(); - static constexpr char const* const s_empty = ""; - public: // construction/ assignment - StringRef() noexcept - : StringRef( s_empty, 0 ) - {} - - StringRef( StringRef const& other ) noexcept - : m_start( other.m_start ), - m_size( other.m_size ) - {} + char const* m_start = s_empty; + size_type m_size = 0; - StringRef( StringRef&& other ) noexcept - : m_start( other.m_start ), - m_size( other.m_size ), - m_data( other.m_data ) - { - other.m_data = nullptr; - } + public: // construction + constexpr StringRef() noexcept = default; StringRef( char const* rawChars ) noexcept; - StringRef( char const* rawChars, size_type size ) noexcept + constexpr StringRef( char const* rawChars, size_type size ) noexcept : m_start( rawChars ), m_size( size ) {} @@ -630,27 +629,15 @@ namespace Catch { m_size( stdString.size() ) {} - ~StringRef() noexcept { - delete[] m_data; - } - - auto operator = ( StringRef const &other ) noexcept -> StringRef& { - delete[] m_data; - m_data = nullptr; - m_start = other.m_start; - m_size = other.m_size; - return *this; - } - explicit operator std::string() const { return std::string(m_start, m_size); } - void swap( StringRef& other ) noexcept; - public: // operators auto operator == ( StringRef const& other ) const noexcept -> bool; - auto operator != ( StringRef const& other ) const noexcept -> bool; + auto operator != (StringRef const& other) const noexcept -> bool { + return !(*this == other); + } auto operator[] ( size_type index ) const noexcept -> char { assert(index < m_size); @@ -658,41 +645,44 @@ namespace Catch { } public: // named queries - auto empty() const noexcept -> bool { + constexpr 
auto empty() const noexcept -> bool { return m_size == 0; } - auto size() const noexcept -> size_type { + constexpr auto size() const noexcept -> size_type { return m_size; } + // Returns the current start pointer. If the StringRef is not + // null-terminated, throws std::domain_exception auto c_str() const -> char const*; public: // substrings and searches - auto substr( size_type start, size_type size ) const noexcept -> StringRef; + // Returns a substring of [start, start + length). + // If start + length > size(), then the substring is [start, size()). + // If start > size(), then the substring is empty. + auto substr( size_type start, size_type length ) const noexcept -> StringRef; - // Returns the current start pointer. - // Note that the pointer can change when if the StringRef is a substring - auto currentData() const noexcept -> char const*; + // Returns the current start pointer. May not be null-terminated. + auto data() const noexcept -> char const*; - public: // iterators - const_iterator begin() const { return m_start; } - const_iterator end() const { return m_start + m_size; } + constexpr auto isNullTerminated() const noexcept -> bool { + return m_start[m_size] == '\0'; + } - private: // ownership queries - may not be consistent between calls - auto isOwned() const noexcept -> bool; - auto isSubstring() const noexcept -> bool; + public: // iterators + constexpr const_iterator begin() const { return m_start; } + constexpr const_iterator end() const { return m_start + m_size; } }; auto operator += ( std::string& lhs, StringRef const& sr ) -> std::string&; auto operator << ( std::ostream& os, StringRef const& sr ) -> std::ostream&; - inline auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef { + constexpr auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef { return StringRef( rawChars, size ); } - } // namespace Catch -inline auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef { +constexpr auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef { return Catch::StringRef( rawChars, size ); } @@ -781,7 +771,7 @@ inline auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noex #define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3) #define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4) #define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5) -#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _4, _5, _6) +#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6) #define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7) #define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8) #define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) INTERNAL_CATCH_REMOVE_PARENS(_0), 
INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9) @@ -931,22 +921,33 @@ inline auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noex #include namespace Catch { -template -struct always_false : std::false_type {}; + template + struct always_false : std::false_type {}; + + template struct true_given : std::true_type {}; + struct is_callable_tester { + template + true_given()(std::declval()...))> static test(int); + template + std::false_type static test(...); + }; -template struct true_given : std::true_type {}; -struct is_callable_tester { - template - true_given()(std::declval()...))> static test(int); - template - std::false_type static test(...); -}; + template + struct is_callable; -template -struct is_callable; + template + struct is_callable : decltype(is_callable_tester::test(0)) {}; -template -struct is_callable : decltype(is_callable_tester::test(0)) {}; +#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703 + // std::result_of is deprecated in C++17 and removed in C++20. Hence, it is + // replaced with std::invoke_result here. + template + using FunctionReturnType = std::remove_reference_t>>; +#else + // Keep ::type here because we still support C++11 + template + using FunctionReturnType = typename std::remove_reference::type>::type>::type; +#endif } // namespace Catch @@ -1011,55 +1012,58 @@ struct AutoReg : NonCopyable { #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags, Signature, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags, Signature, ...) 
\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION( ClassName, Name, Tags,... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION( ClassName, Name, Tags,... ) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) #endif #endif /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TESTCASE2( TestName, ... 
) \ static void TestName(); \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( &TestName ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ __VA_ARGS__ } ); } /* NOLINT */ \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ static void TestName() #define INTERNAL_CATCH_TESTCASE( ... ) \ - INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), __VA_ARGS__ ) + INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), __VA_ARGS__ ) /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( &QualifiedMethod ), CATCH_INTERNAL_LINEINFO, "&" #QualifiedMethod, Catch::NameAndTags{ __VA_ARGS__ } ); } /* NOLINT */ \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestName, ClassName, ... )\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ \ struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName) { \ @@ -1067,19 +1071,21 @@ struct AutoReg : NonCopyable { }; \ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar ) ( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ __VA_ARGS__ } ); /* NOLINT */ \ } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ void TestName::test() #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, ... ) \ - INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), ClassName, __VA_ARGS__ ) + INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), ClassName, __VA_ARGS__ ) /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( Catch::makeTestInvoker( Function ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ __VA_ARGS__ } ); /* NOLINT */ \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(TestName, TestFunc, Name, Tags, Signature, ... )\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ @@ -1095,7 +1101,7 @@ struct AutoReg : NonCopyable { int index = 0; \ constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\ using expander = int[];\ - (void)expander{(reg_test(Types{}, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(reg_test(Types{}, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... 
};/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1104,31 +1110,30 @@ struct AutoReg : NonCopyable { }();\ }\ }\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ INTERNAL_CATCH_DEFINE_SIG_TEST(TestFunc,INTERNAL_CATCH_REMOVE_PARENS(Signature)) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename TestType, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...) 
\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(TestName, TestFuncName, Name, Tags, Signature, TmplTypes, TypesList) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ template static void TestFuncName(); \ namespace {\ namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) { \ @@ -1142,7 +1147,7 @@ struct AutoReg : NonCopyable { constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\ constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\ constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFuncName ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++, 0)... };/* NOLINT */\ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFuncName ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... 
};/* NOLINT */\ } \ }; \ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \ @@ -1153,29 +1158,28 @@ struct AutoReg : NonCopyable { }(); \ } \ } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ static void TestFuncName() #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename T,__VA_ARGS__) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename T,__VA_ARGS__) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, typename T, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, typename T, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature, ...)\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature, ...)\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, Signature, __VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2(TestName, TestFunc, Name, Tags, TmplList)\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ template static void TestFunc(); \ @@ -1187,7 +1191,7 @@ struct AutoReg : NonCopyable { void reg_tests() { \ int index = 0; \ using expander = int[]; \ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFunc ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++, 0)... 
};/* NOLINT */\ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestFunc ), CATCH_INTERNAL_LINEINFO, Catch::StringRef(), Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */\ } \ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){ \ @@ -1195,17 +1199,17 @@ struct AutoReg : NonCopyable { TestInit t; \ t.reg_tests(); \ return 0; \ - }(); \ + }(); \ }}\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ static void TestFunc() #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE(Name, Tags, TmplList) \ - INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), Name, Tags, TmplList ) + INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), Name, Tags, TmplList ) #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( TestNameClass, TestName, ClassName, Name, Tags, Signature, ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ @@ -1221,7 +1225,7 @@ struct AutoReg : NonCopyable { int index = 0; \ constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};\ using expander = int[];\ - (void)expander{(reg_test(Types{}, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(reg_test(Types{}, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index]), Tags } ), index++)... };/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1230,28 +1234,27 @@ struct AutoReg : NonCopyable { }();\ }\ }\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS\ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS\ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS\ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature)) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( ClassName, Name, Tags,... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD( ClassName, Name, Tags,... 
) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, typename T, __VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... ) \ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_C_L_A_S_S_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ) , ClassName, Name, Tags, Signature, __VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(TestNameClass, TestName, ClassName, Name, Tags, Signature, TmplTypes, TypesList)\ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ @@ -1271,7 +1274,7 @@ struct AutoReg : NonCopyable { constexpr char const* tmpl_types[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};\ constexpr char const* types_list[] = {CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, INTERNAL_CATCH_REMOVE_PARENS(TypesList))};\ constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]);\ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(tmpl_types[index / num_types]) + "<" + std::string(types_list[index % num_types]) + ">", Tags } ), index++)... 
};/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1282,29 +1285,28 @@ struct AutoReg : NonCopyable { }(); \ }\ }\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_ZERO_VARIADIC_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ void TestName::test() #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( ClassName, Name, Tags, ... )\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, typename T, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, typename T, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD( ClassName, Name, Tags, ... )\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, typename T,__VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, typename T,__VA_ARGS__ ) ) #endif #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... )\ - INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, Signature, __VA_ARGS__ ) + INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, Signature, __VA_ARGS__ ) #else #define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG( ClassName, Name, Tags, Signature, ... 
)\ - INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, Signature,__VA_ARGS__ ) ) + INTERNAL_CATCH_EXPAND_VARGS( INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, Signature,__VA_ARGS__ ) ) #endif #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( TestNameClass, TestName, ClassName, Name, Tags, TmplList) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ template \ @@ -1319,7 +1321,7 @@ struct AutoReg : NonCopyable { void reg_tests(){\ int index = 0;\ using expander = int[];\ - (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++, 0)... };/* NOLINT */ \ + (void)expander{(Catch::AutoReg( Catch::makeTestInvoker( &TestName::test ), CATCH_INTERNAL_LINEINFO, #ClassName, Catch::NameAndTags{ Name " - " + std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) + " - " + std::to_string(index), Tags } ), index++)... };/* NOLINT */ \ }\ };\ static int INTERNAL_CATCH_UNIQUE_NAME( globalRegistrar ) = [](){\ @@ -1329,13 +1331,12 @@ struct AutoReg : NonCopyable { return 0;\ }(); \ }}\ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_TEMPLATE_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ template \ void TestName::test() #define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD(ClassName, Name, Tags, TmplList) \ - INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____ ), INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____ ), ClassName, Name, Tags, TmplList ) + INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2( INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_ ), INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_M_P_L_A_T_E_T_E_S_T_F_U_N_C_ ), ClassName, Name, Tags, TmplList ) // end catch_test_registry.h // start catch_capture.hpp @@ -1436,7 +1437,7 @@ namespace Catch { auto makeStream( StringRef const &filename ) -> IStream const*; - class ReusableStringStream { + class ReusableStringStream : NonCopyable { std::size_t m_index; std::ostream* m_oss; public: @@ -1824,8 +1825,8 @@ namespace Catch { #endif namespace Detail { - template - std::string rangeToString(InputIterator first, InputIterator last) { + template + std::string rangeToString(InputIterator first, Sentinel last) { ReusableStringStream rss; rss << "{ "; if (first != last) { @@ -1983,20 +1984,27 @@ namespace Catch { #endif // CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER namespace Catch { - struct not_this_one {}; // Tag type for detecting which begin/ end are being selected - - // Import begin/ end from std here so they are considered alongside the fallback (...) overloads in this namespace + // Import begin/ end from std here using std::begin; using std::end; - not_this_one begin( ... ); - not_this_one end( ... 
); + namespace detail { + template + struct void_type { + using type = void; + }; + + template + struct is_range_impl : std::false_type { + }; + + template + struct is_range_impl()))>::type> : std::true_type { + }; + } // namespace detail template - struct is_range { - static const bool value = - !std::is_same())), not_this_one>::value && - !std::is_same())), not_this_one>::value; + struct is_range : detail::is_range_impl { }; #if defined(_MANAGED) // Managed types are never ranges @@ -2364,6 +2372,18 @@ namespace Catch { auto operator <= ( RhsT const& rhs ) -> BinaryExpr const { return { static_cast(m_lhs <= rhs), m_lhs, "<=", rhs }; } + template + auto operator | (RhsT const& rhs) -> BinaryExpr const { + return { static_cast(m_lhs | rhs), m_lhs, "|", rhs }; + } + template + auto operator & (RhsT const& rhs) -> BinaryExpr const { + return { static_cast(m_lhs & rhs), m_lhs, "&", rhs }; + } + template + auto operator ^ (RhsT const& rhs) -> BinaryExpr const { + return { static_cast(m_lhs ^ rhs), m_lhs, "^", rhs }; + } template auto operator && ( RhsT const& ) -> BinaryExpr const { @@ -2444,7 +2464,7 @@ namespace Catch { virtual void sectionEnded( SectionEndInfo const& endInfo ) = 0; virtual void sectionEndedEarly( SectionEndInfo const& endInfo ) = 0; - virtual auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& = 0; + virtual auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& = 0; #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) virtual void benchmarkPreparing( std::string const& name ) = 0; @@ -2682,15 +2702,16 @@ namespace Catch { /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TEST( macroName, resultDisposition, ... ) \ do { \ + CATCH_INTERNAL_IGNORE_BUT_WARN(__VA_ARGS__); \ Catch::AssertionHandler catchAssertionHandler( macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition ); \ INTERNAL_CATCH_TRY { \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ catchAssertionHandler.handleExpr( Catch::Decomposer() <= __VA_ARGS__ ); \ - CATCH_INTERNAL_UNSUPPRESS_PARENTHESES_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ } INTERNAL_CATCH_CATCH( catchAssertionHandler ) \ INTERNAL_CATCH_REACT( catchAssertionHandler ) \ - } while( (void)0, (false) && static_cast( !!(__VA_ARGS__) ) ) // the expression here is never evaluated at runtime but it forces the compiler to give it a look - // The double negation silences MSVC's C4800 warning, the static_cast forces short-circuit evaluation if the type has overloaded &&. + } while( (void)0, (false) && static_cast( !!(__VA_ARGS__) ) ) /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_IF( macroName, resultDisposition, ... ) \ @@ -2907,14 +2928,16 @@ namespace Catch { } // end namespace Catch #define INTERNAL_CATCH_SECTION( ... ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) ) \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #define INTERNAL_CATCH_DYNAMIC_SECTION( ... 
) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, (Catch::ReusableStringStream() << __VA_ARGS__).str() ) ) \ - CATCH_INTERNAL_UNSUPPRESS_UNUSED_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION // end catch_section.h // start catch_interfaces_exception.h @@ -3005,6 +3028,9 @@ namespace Catch { {} std::string translate( ExceptionTranslators::const_iterator it, ExceptionTranslators::const_iterator itEnd ) const override { +#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) + return ""; +#else try { if( it == itEnd ) std::rethrow_exception(std::current_exception()); @@ -3014,6 +3040,7 @@ namespace Catch { catch( T& ex ) { return m_translateFunction( ex ); } +#endif } protected: @@ -3032,9 +3059,10 @@ namespace Catch { /////////////////////////////////////////////////////////////////////////////// #define INTERNAL_CATCH_TRANSLATE_EXCEPTION2( translatorName, signature ) \ static std::string translatorName( signature ); \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::ExceptionTranslatorRegistrar INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionRegistrar )( &translatorName ); } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION \ static std::string translatorName( signature ) #define INTERNAL_CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION2( INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionTranslator ), signature ) @@ -3065,7 +3093,7 @@ namespace Detail { Approx operator-() const; template ::value>::type> - Approx operator()( T const& value ) { + Approx operator()( T const& value ) const { Approx approx( static_cast(value) ); approx.m_epsilon = m_epsilon; approx.m_margin = m_margin; @@ -3281,9 +3309,10 @@ namespace Matchers { return description; } - MatchAllOf& operator && ( MatcherBase const& other ) { - m_matchers.push_back( &other ); - return *this; + MatchAllOf operator && ( MatcherBase const& other ) { + auto copy(*this); + copy.m_matchers.push_back( &other ); + return copy; } std::vector const*> m_matchers; @@ -3314,9 +3343,10 @@ namespace Matchers { return description; } - MatchAnyOf& operator || ( MatcherBase const& other ) { - m_matchers.push_back( &other ); - return *this; + MatchAnyOf operator || ( MatcherBase const& other ) { + auto copy(*this); + copy.m_matchers.push_back( &other ); + return copy; } std::vector const*> m_matchers; @@ -3573,12 +3603,12 @@ namespace Catch { namespace Matchers { namespace Vector { - template - struct ContainsElementMatcher : MatcherBase> { + template + struct ContainsElementMatcher : MatcherBase> { ContainsElementMatcher(T const &comparator) : m_comparator( comparator) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { for (auto const& el : v) { if (el == m_comparator) { return true; @@ -3594,12 +3624,12 @@ namespace Matchers { T const& m_comparator; }; - template - struct ContainsMatcher : MatcherBase> { + template + struct ContainsMatcher : MatcherBase> { - ContainsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} + ContainsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { // !TBD: see note in EqualsMatcher if (m_comparator.size() > v.size()) return false; @@ 
-3621,18 +3651,18 @@ namespace Matchers { return "Contains: " + ::Catch::Detail::stringify( m_comparator ); } - std::vector const& m_comparator; + std::vector const& m_comparator; }; - template - struct EqualsMatcher : MatcherBase> { + template + struct EqualsMatcher : MatcherBase> { - EqualsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} + EqualsMatcher(std::vector const &comparator) : m_comparator( comparator ) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { // !TBD: This currently works if all elements can be compared using != // - a more general approach would be via a compare template that defaults - // to using !=. but could be specialised for, e.g. std::vector etc + // to using !=. but could be specialised for, e.g. std::vector etc // - then just call that directly if (m_comparator.size() != v.size()) return false; @@ -3644,15 +3674,15 @@ namespace Matchers { std::string describe() const override { return "Equals: " + ::Catch::Detail::stringify( m_comparator ); } - std::vector const& m_comparator; + std::vector const& m_comparator; }; - template - struct ApproxMatcher : MatcherBase> { + template + struct ApproxMatcher : MatcherBase> { - ApproxMatcher(std::vector const& comparator) : m_comparator( comparator ) {} + ApproxMatcher(std::vector const& comparator) : m_comparator( comparator ) {} - bool match(std::vector const &v) const override { + bool match(std::vector const &v) const override { if (m_comparator.size() != v.size()) return false; for (std::size_t i = 0; i < v.size(); ++i) @@ -3679,16 +3709,14 @@ namespace Matchers { return *this; } - std::vector const& m_comparator; + std::vector const& m_comparator; mutable Catch::Detail::Approx approx = Catch::Detail::Approx::custom(); }; - template - struct UnorderedEqualsMatcher : MatcherBase> { - UnorderedEqualsMatcher(std::vector const& target) : m_target(target) {} - bool match(std::vector const& vec) const override { - // Note: This is a reimplementation of std::is_permutation, - // because I don't want to include inside the common path + template + struct UnorderedEqualsMatcher : MatcherBase> { + UnorderedEqualsMatcher(std::vector const& target) : m_target(target) {} + bool match(std::vector const& vec) const override { if (m_target.size() != vec.size()) { return false; } @@ -3699,7 +3727,7 @@ namespace Matchers { return "UnorderedEquals: " + ::Catch::Detail::stringify(m_target); } private: - std::vector const& m_target; + std::vector const& m_target; }; } // namespace Vector @@ -3707,29 +3735,29 @@ namespace Matchers { // The following functions create the actual matcher objects. 
// This allows the types to be inferred - template - Vector::ContainsMatcher Contains( std::vector const& comparator ) { - return Vector::ContainsMatcher( comparator ); + template, typename AllocMatch = AllocComp> + Vector::ContainsMatcher Contains( std::vector const& comparator ) { + return Vector::ContainsMatcher( comparator ); } - template - Vector::ContainsElementMatcher VectorContains( T const& comparator ) { - return Vector::ContainsElementMatcher( comparator ); + template> + Vector::ContainsElementMatcher VectorContains( T const& comparator ) { + return Vector::ContainsElementMatcher( comparator ); } - template - Vector::EqualsMatcher Equals( std::vector const& comparator ) { - return Vector::EqualsMatcher( comparator ); + template, typename AllocMatch = AllocComp> + Vector::EqualsMatcher Equals( std::vector const& comparator ) { + return Vector::EqualsMatcher( comparator ); } - template - Vector::ApproxMatcher Approx( std::vector const& comparator ) { - return Vector::ApproxMatcher( comparator ); + template, typename AllocMatch = AllocComp> + Vector::ApproxMatcher Approx( std::vector const& comparator ) { + return Vector::ApproxMatcher( comparator ); } - template - Vector::UnorderedEqualsMatcher UnorderedEquals(std::vector const& target) { - return Vector::UnorderedEqualsMatcher(target); + template, typename AllocMatch = AllocComp> + Vector::UnorderedEqualsMatcher UnorderedEquals(std::vector const& target) { + return Vector::UnorderedEqualsMatcher( target ); } } // namespace Matchers @@ -3925,7 +3953,6 @@ namespace Generators { class SingleValueGenerator final : public IGenerator { T m_value; public: - SingleValueGenerator(T const& value) : m_value( value ) {} SingleValueGenerator(T&& value) : m_value(std::move(value)) {} T const& get() const override { @@ -3988,21 +4015,21 @@ namespace Generators { m_generators.emplace_back(std::move(generator)); } void populate(T&& val) { - m_generators.emplace_back(value(std::move(val))); + m_generators.emplace_back(value(std::forward(val))); } template void populate(U&& val) { - populate(T(std::move(val))); + populate(T(std::forward(val))); } template - void populate(U&& valueOrGenerator, Gs... moreGenerators) { + void populate(U&& valueOrGenerator, Gs &&... moreGenerators) { populate(std::forward(valueOrGenerator)); populate(std::forward(moreGenerators)...); } public: template - Generators(Gs... moreGenerators) { + Generators(Gs &&... moreGenerators) { m_generators.reserve(sizeof...(Gs)); populate(std::forward(moreGenerators)...); } @@ -4033,7 +4060,7 @@ namespace Generators { struct as {}; template - auto makeGenerators( GeneratorWrapper&& generator, Gs... moreGenerators ) -> Generators { + auto makeGenerators( GeneratorWrapper&& generator, Gs &&... moreGenerators ) -> Generators { return Generators(std::move(generator), std::forward(moreGenerators)...); } template @@ -4041,24 +4068,24 @@ namespace Generators { return Generators(std::move(generator)); } template - auto makeGenerators( T&& val, Gs... moreGenerators ) -> Generators { + auto makeGenerators( T&& val, Gs &&... moreGenerators ) -> Generators { return makeGenerators( value( std::forward( val ) ), std::forward( moreGenerators )... ); } template - auto makeGenerators( as, U&& val, Gs... moreGenerators ) -> Generators { + auto makeGenerators( as, U&& val, Gs &&... moreGenerators ) -> Generators { return makeGenerators( value( T( std::forward( val ) ) ), std::forward( moreGenerators )... 
); } - auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker&; + auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker&; template // Note: The type after -> is weird, because VS2015 cannot parse // the expression used in the typedef inside, when it is in // return type. Yeah. - auto generate( SourceLineInfo const& lineInfo, L const& generatorExpression ) -> decltype(std::declval().get()) { + auto generate( StringRef generatorName, SourceLineInfo const& lineInfo, L const& generatorExpression ) -> decltype(std::declval().get()) { using UnderlyingType = typename decltype(generatorExpression())::type; - IGeneratorTracker& tracker = acquireGeneratorTracker( lineInfo ); + IGeneratorTracker& tracker = acquireGeneratorTracker( generatorName, lineInfo ); if (!tracker.hasGenerator()) { tracker.setGenerator(pf::make_unique>(generatorExpression())); } @@ -4071,11 +4098,17 @@ namespace Generators { } // namespace Catch #define GENERATE( ... ) \ - Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [ ]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) + Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \ + CATCH_INTERNAL_LINEINFO, \ + [ ]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace) #define GENERATE_COPY( ... ) \ - Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [=]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) + Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \ + CATCH_INTERNAL_LINEINFO, \ + [=]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace) #define GENERATE_REF( ... ) \ - Catch::Generators::generate( CATCH_INTERNAL_LINEINFO, [&]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) + Catch::Generators::generate( INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)), \ + CATCH_INTERNAL_LINEINFO, \ + [&]{ using namespace Catch::Generators; return makeGenerators( __VA_ARGS__ ); } ) //NOLINT(google-build-using-namespace) // end catch_generators.hpp // start catch_generators_generic.hpp @@ -4132,7 +4165,7 @@ namespace Generators { if (!m_predicate(m_generator.get())) { // It might happen that there are no values that pass the // filter. In that case we throw an exception. - auto has_initial_value = next(); + auto has_initial_value = nextImpl(); if (!has_initial_value) { Catch::throw_exception(GeneratorException("No valid value found in filtered generator")); } @@ -4144,6 +4177,11 @@ namespace Generators { } bool next() override { + return nextImpl(); + } + + private: + bool nextImpl() { bool success = m_generator.next(); if (!success) { return false; @@ -4241,18 +4279,7 @@ namespace Generators { } }; -#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703 - // std::result_of is deprecated in C++17 and removed in C++20. Hence, it is - // replaced with std::invoke_result here. Also *_t format is preferred over - // typename *::type format. 
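For context on the generator change above: GENERATE, GENERATE_COPY and GENERATE_REF now stringize a per-expansion unique name and pass it, together with the source line, to Catch::Generators::generate(), so generator trackers are keyed by that name as well as by location. A minimal usage sketch follows; it uses only the public Catch2 API bundled here, and the include path and test body are illustrative, not taken from this patch.

    #include "catch.hpp"   // single-header Catch2; adjust to wherever the header is vendored

    TEST_CASE("generators get per-expansion tracker names") {
        // Each GENERATE below expands with its own INTERNAL_CATCH_UNIQUE_NAME,
        // so the two generators are tracked independently.
        auto i = GENERATE(1, 2, 3);
        auto j = GENERATE(range(0, 2));  // range() resolves via Catch::Generators inside the macro
        REQUIRE(i + j >= 1);
    }
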
- template - using MapFunctionReturnType = std::remove_reference_t>>; -#else - template - using MapFunctionReturnType = typename std::remove_reference::type>::type>::type; -#endif - - template > + template > GeneratorWrapper map(Func&& function, GeneratorWrapper&& generator) { return GeneratorWrapper( pf::make_unique>(std::forward(function), std::move(generator)) @@ -4438,6 +4465,7 @@ namespace Catch { } // end namespace Catch // end catch_option.hpp +#include #include #include #include @@ -4495,6 +4523,7 @@ namespace Catch { virtual int abortAfter() const = 0; virtual bool showInvisibles() const = 0; virtual ShowDurations::OrNot showDurations() const = 0; + virtual double minDuration() const = 0; virtual TestSpec const& testSpec() const = 0; virtual bool hasTestFilters() const = 0; virtual std::vector const& getTestsOrTags() const = 0; @@ -4508,6 +4537,7 @@ namespace Catch { virtual int benchmarkSamples() const = 0; virtual double benchmarkConfidenceInterval() const = 0; virtual unsigned int benchmarkResamples() const = 0; + virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0; }; using IConfigPtr = std::shared_ptr; @@ -4668,7 +4698,7 @@ class RangeGenerator final : public IGenerator { template GeneratorWrapper range(T const& start, T const& end, T const& step) { - static_assert(std::is_integral::value && !std::is_same::value, "Type must be an integer"); + static_assert(std::is_arithmetic::value && !std::is_same::value, "Type must be numeric"); return GeneratorWrapper(pf::make_unique>(start, end, step)); } @@ -5203,27 +5233,12 @@ namespace Catch { void addFilter(); bool separate(); - template - void addPattern() { - std::string token = m_patternName; - for( std::size_t i = 0; i < m_escapeChars.size(); ++i ) - token = token.substr( 0, m_escapeChars[i] - i ) + token.substr( m_escapeChars[i] -i +1 ); - m_escapeChars.clear(); - if( startsWith( token, "exclude:" ) ) { - m_exclusion = true; - token = token.substr( 8 ); - } - if( !token.empty() ) { - TestSpec::PatternPtr pattern = std::make_shared( token, m_substring ); - if( m_exclusion ) - pattern = std::make_shared( pattern ); - m_currentFilter.m_patterns.push_back( pattern ); - } - m_substring.clear(); - m_patternName.clear(); - m_exclusion = false; - m_mode = None; - } + // Handles common preprocessing of the pattern for name/tag patterns + std::string preprocessPattern(); + // Adds the current pattern as a test name + void addNamePattern(); + // Adds the current pattern as a tag + void addTagPattern(); inline void addCharToPattern(char c) { m_substring += c; @@ -5276,10 +5291,12 @@ namespace Catch { unsigned int benchmarkSamples = 100; double benchmarkConfidenceInterval = 0.95; unsigned int benchmarkResamples = 100000; + std::chrono::milliseconds::rep benchmarkWarmupTime = 100; Verbosity verbosity = Verbosity::Normal; WarnAbout::What warnings = WarnAbout::Nothing; ShowDurations::OrNot showDurations = ShowDurations::DefaultForReporter; + double minDuration = -1; RunTests::InWhatOrder runOrder = RunTests::InDeclarationOrder; UseColour::YesOrNo useColour = UseColour::Auto; WaitForKeypress::When waitForKeypress = WaitForKeypress::Never; @@ -5330,6 +5347,7 @@ namespace Catch { bool warnAboutMissingAssertions() const override; bool warnAboutNoTests() const override; ShowDurations::OrNot showDurations() const override; + double minDuration() const override; RunTests::InWhatOrder runOrder() const override; unsigned int rngSeed() const override; UseColour::YesOrNo useColour() const override; @@ -5341,6 +5359,7 @@ namespace Catch { int 
benchmarkSamples() const override; double benchmarkConfidenceInterval() const override; unsigned int benchmarkResamples() const override; + std::chrono::milliseconds benchmarkWarmupTime() const override; private: @@ -5446,6 +5465,8 @@ namespace Catch { } // namespace Catch // end catch_outlier_classification.hpp + +#include #endif // CATCH_CONFIG_ENABLE_BENCHMARKING #include @@ -5706,6 +5727,9 @@ namespace Catch { // Returns double formatted as %.3f (format expected on output) std::string getFormattedDuration( double duration ); + //! Should the reporter show + bool shouldShowDuration( IConfig const& config, double duration ); + std::string serializeFilters( std::vector const& container ); template @@ -6068,14 +6092,16 @@ namespace Catch { #if !defined(CATCH_CONFIG_DISABLE) #define CATCH_REGISTER_REPORTER( name, reporterType ) \ + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::ReporterRegistrar catch_internal_RegistrarFor##reporterType( name ); } \ - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #define CATCH_REGISTER_LISTENER( listenerType ) \ - CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ - namespace{ Catch::ListenerRegistrar catch_internal_RegistrarFor##listenerType; } \ - CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ + CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ + namespace{ Catch::ListenerRegistrar catch_internal_RegistrarFor##listenerType; } \ + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #else // CATCH_CONFIG_DISABLE #define CATCH_REGISTER_REPORTER(name, reporterType) @@ -6097,8 +6123,6 @@ namespace Catch { static std::string getDescription(); - ReporterPreferences getPreferences() const override; - void noMatchingTestCases(std::string const& spec) override; void assertionStarting(AssertionInfo const&) override; @@ -6198,6 +6222,14 @@ namespace Catch { #include namespace Catch { + enum class XmlFormatting { + None = 0x00, + Indent = 0x01, + Newline = 0x02, + }; + + XmlFormatting operator | (XmlFormatting lhs, XmlFormatting rhs); + XmlFormatting operator & (XmlFormatting lhs, XmlFormatting rhs); class XmlEncode { public: @@ -6219,14 +6251,14 @@ namespace Catch { class ScopedElement { public: - ScopedElement( XmlWriter* writer ); + ScopedElement( XmlWriter* writer, XmlFormatting fmt ); ScopedElement( ScopedElement&& other ) noexcept; ScopedElement& operator=( ScopedElement&& other ) noexcept; ~ScopedElement(); - ScopedElement& writeText( std::string const& text, bool indent = true ); + ScopedElement& writeText( std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent ); template ScopedElement& writeAttribute( std::string const& name, T const& attribute ) { @@ -6236,6 +6268,7 @@ namespace Catch { private: mutable XmlWriter* m_writer = nullptr; + XmlFormatting m_fmt; }; XmlWriter( std::ostream& os = Catch::cout() ); @@ -6244,11 +6277,11 @@ namespace Catch { XmlWriter( XmlWriter const& ) = delete; XmlWriter& operator=( XmlWriter const& ) = delete; - XmlWriter& startElement( std::string const& name ); + XmlWriter& startElement( std::string const& name, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); - ScopedElement scopedElement( std::string const& name ); + ScopedElement scopedElement( std::string const& name, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); - XmlWriter& endElement(); + XmlWriter& endElement(XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); 
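The XmlFormatting type introduced above is a bit-flag enum: Indent and Newline combine through the declared operator| and are tested through operator&, and the XmlWriter methods default their formatting parameter to Newline | Indent. The sketch below shows the same pattern with stand-in names (Fmt, hasFlag); it is illustrative only, not code from this patch.

    #include <cstdint>

    enum class Fmt : std::uint8_t { None = 0x00, Indent = 0x01, Newline = 0x02 };

    inline Fmt operator|(Fmt lhs, Fmt rhs) {
        return static_cast<Fmt>(static_cast<std::uint8_t>(lhs) | static_cast<std::uint8_t>(rhs));
    }

    inline bool hasFlag(Fmt value, Fmt flag) {
        return (static_cast<std::uint8_t>(value) & static_cast<std::uint8_t>(flag)) != 0;
    }

    // A writer method taking "Fmt fmt = Fmt::Newline | Fmt::Indent" can then
    // indent only when hasFlag(fmt, Fmt::Indent) holds and break lines only
    // when hasFlag(fmt, Fmt::Newline) holds.
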
XmlWriter& writeAttribute( std::string const& name, std::string const& attribute ); @@ -6261,9 +6294,9 @@ namespace Catch { return writeAttribute( name, rss.str() ); } - XmlWriter& writeText( std::string const& text, bool indent = true ); + XmlWriter& writeText( std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); - XmlWriter& writeComment( std::string const& text ); + XmlWriter& writeComment(std::string const& text, XmlFormatting fmt = XmlFormatting::Newline | XmlFormatting::Indent); void writeStylesheetRef( std::string const& url ); @@ -6273,6 +6306,8 @@ namespace Catch { private: + void applyFormatting(XmlFormatting fmt); + void writeDeclaration(); void newlineIfNecessary(); @@ -6316,9 +6351,10 @@ namespace Catch { void writeTestCase(TestCaseNode const& testCaseNode); - void writeSection(std::string const& className, - std::string const& rootName, - SectionNode const& sectionNode); + void writeSection( std::string const& className, + std::string const& rootName, + SectionNode const& sectionNode, + bool testOkToFail ); void writeAssertions(SectionNode const& sectionNode); void writeAssertion(AssertionStats const& stats); @@ -6394,6 +6430,12 @@ namespace Catch { #endif #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) +// start catch_benchmarking_all.hpp + +// A proxy header that includes all of the benchmarking headers to allow +// concise include of the benchmarking features. You should prefer the +// individual includes in standard use. + // start catch_benchmark.hpp // Benchmark @@ -6529,20 +6571,18 @@ namespace Catch { return {}; } }; - template - using ResultOf_t = typename std::result_of::type; // invoke and not return void :( template - CompleteType_t> complete_invoke(Fun&& fun, Args&&... args) { - return CompleteInvoker>::invoke(std::forward(fun), std::forward(args)...); + CompleteType_t> complete_invoke(Fun&& fun, Args&&... args) { + return CompleteInvoker>::invoke(std::forward(fun), std::forward(args)...); } const std::string benchmarkErrorMsg = "a benchmark failed to run successfully"; } // namespace Detail template - Detail::CompleteType_t> user_code(Fun&& fun) { + Detail::CompleteType_t> user_code(Fun&& fun) { CATCH_TRY{ return Detail::complete_invoke(std::forward(fun)); } CATCH_CATCH_ALL{ @@ -6787,8 +6827,8 @@ namespace Catch { Result result; int iterations; }; - template - using TimingOf = Timing, Detail::CompleteType_t>>; + template + using TimingOf = Timing, Detail::CompleteType_t>>; } // namespace Benchmark } // namespace Catch @@ -6799,7 +6839,7 @@ namespace Catch { namespace Benchmark { namespace Detail { template - TimingOf measure(Fun&& fun, Args&&... args) { + TimingOf measure(Fun&& fun, Args&&... 
args) { auto start = Clock::now(); auto&& r = Detail::complete_invoke(fun, std::forward(args)...); auto end = Clock::now(); @@ -6818,11 +6858,11 @@ namespace Catch { namespace Benchmark { namespace Detail { template - TimingOf measure_one(Fun&& fun, int iters, std::false_type) { + TimingOf measure_one(Fun&& fun, int iters, std::false_type) { return Detail::measure(fun, iters); } template - TimingOf measure_one(Fun&& fun, int iters, std::true_type) { + TimingOf measure_one(Fun&& fun, int iters, std::true_type) { Detail::ChronometerModel meter; auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters)); @@ -6839,7 +6879,7 @@ namespace Catch { }; template - TimingOf)> run_for_at_least(ClockDuration how_long, int seed, Fun&& fun) { + TimingOf> run_for_at_least(ClockDuration how_long, int seed, Fun&& fun) { auto iters = seed; while (iters < (1 << 30)) { auto&& Timing = measure_one(fun, iters, is_callable()); @@ -6849,7 +6889,7 @@ namespace Catch { } iters *= 2; } - throw optimized_away_error{}; + Catch::throw_exception(optimized_away_error{}); } } // namespace Detail } // namespace Benchmark @@ -6857,6 +6897,7 @@ namespace Catch { // end catch_run_for_at_least.hpp #include +#include namespace Catch { namespace Benchmark { @@ -6907,11 +6948,13 @@ namespace Catch { #include #include #include +#include #include #include #include #include #include +#include namespace Catch { namespace Benchmark { @@ -7025,8 +7068,8 @@ namespace Catch { double b2 = bias - z1; double a1 = a(b1); double a2 = a(b2); - auto lo = std::max(cumn(a1), 0); - auto hi = std::min(cumn(a2), n - 1); + auto lo = (std::max)(cumn(a1), 0); + auto hi = (std::min)(cumn(a2), n - 1); return { point, resample[lo], resample[hi], confidence_level }; } @@ -7095,7 +7138,9 @@ namespace Catch { } template EnvironmentEstimate> estimate_clock_cost(FloatDuration resolution) { - auto time_limit = std::min(resolution * clock_cost_estimation_tick_limit, FloatDuration(clock_cost_estimation_time_limit)); + auto time_limit = (std::min)( + resolution * clock_cost_estimation_tick_limit, + FloatDuration(clock_cost_estimation_time_limit)); auto time_clock = [](int k) { return Detail::measure([k] { for (int i = 0; i < k; ++i) { @@ -7261,10 +7306,10 @@ namespace Catch { template ExecutionPlan> prepare(const IConfig &cfg, Environment> env) const { auto min_time = env.clock_resolution.mean * Detail::minimum_ticks; - auto run_time = std::max(min_time, std::chrono::duration_cast(Detail::warmup_time)); + auto run_time = std::max(min_time, std::chrono::duration_cast(cfg.benchmarkWarmupTime())); auto&& test = Detail::run_for_at_least(std::chrono::duration_cast>(run_time), 1, fun); int new_iters = static_cast(std::ceil(min_time * test.iterations / test.elapsed)); - return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast>(Detail::warmup_time), Detail::warmup_iterations }; + return { new_iters, test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(), fun, std::chrono::duration_cast>(cfg.benchmarkWarmupTime()), Detail::warmup_iterations }; } template @@ -7296,7 +7341,7 @@ namespace Catch { }); auto analysis = Detail::analyse(*cfg, env, samples.begin(), samples.end()); - BenchmarkStats> stats{ info, analysis.samples, analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance }; + BenchmarkStats> stats{ info, analysis.samples, analysis.mean, analysis.standard_deviation, analysis.outliers, analysis.outlier_variance }; 
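Also visible above: ExecutionPlan preparation now takes its warm-up duration from cfg.benchmarkWarmupTime() instead of the hard-coded Detail::warmup_time, and later hunks in this patch expose the value on the command line as --benchmark-warmup-time (milliseconds, default 100). A minimal benchmark that this setting would affect is sketched below; fibonacci() is a placeholder workload and the include path is an assumption.

    #define CATCH_CONFIG_ENABLE_BENCHMARKING
    #include "catch.hpp"
    #include <cstdint>

    static std::uint64_t fibonacci(std::uint64_t n) {  // placeholder workload, not from the patch
        return n < 2 ? n : fibonacci(n - 1) + fibonacci(n - 2);
    }

    TEST_CASE("warm-up length is configurable") {
        BENCHMARK("fibonacci(20)") {
            return fibonacci(20);  // warm-up before sampling now runs for --benchmark-warmup-time
        };
    }
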
getResultCapture().benchmarkEnded(stats); } CATCH_CATCH_ALL{ @@ -7337,6 +7382,77 @@ namespace Catch { BenchmarkName = [&] // end catch_benchmark.hpp +// start catch_constructor.hpp + +// Constructor and destructor helpers + + +#include + +namespace Catch { + namespace Benchmark { + namespace Detail { + template + struct ObjectStorage + { + using TStorage = typename std::aligned_storage::value>::type; + + ObjectStorage() : data() {} + + ObjectStorage(const ObjectStorage& other) + { + new(&data) T(other.stored_object()); + } + + ObjectStorage(ObjectStorage&& other) + { + new(&data) T(std::move(other.stored_object())); + } + + ~ObjectStorage() { destruct_on_exit(); } + + template + void construct(Args&&... args) + { + new (&data) T(std::forward(args)...); + } + + template + typename std::enable_if::type destruct() + { + stored_object().~T(); + } + + private: + // If this is a constructor benchmark, destruct the underlying object + template + void destruct_on_exit(typename std::enable_if::type* = 0) { destruct(); } + // Otherwise, don't + template + void destruct_on_exit(typename std::enable_if::type* = 0) { } + + T& stored_object() { + return *static_cast(static_cast(&data)); + } + + T const& stored_object() const { + return *static_cast(static_cast(&data)); + } + + TStorage data; + }; + } + + template + using storage_for = Detail::ObjectStorage; + + template + using destructable_object = Detail::ObjectStorage; + } +} + +// end catch_constructor.hpp +// end catch_benchmarking_all.hpp #endif #endif // ! CATCH_CONFIG_IMPL_ONLY @@ -7364,23 +7480,37 @@ namespace TestCaseTracking { SourceLineInfo location; NameAndLocation( std::string const& _name, SourceLineInfo const& _location ); + friend bool operator==(NameAndLocation const& lhs, NameAndLocation const& rhs) { + return lhs.name == rhs.name + && lhs.location == rhs.location; + } }; - struct ITracker; + class ITracker; using ITrackerPtr = std::shared_ptr; - struct ITracker { - virtual ~ITracker(); + class ITracker { + NameAndLocation m_nameAndLocation; + + public: + ITracker(NameAndLocation const& nameAndLoc) : + m_nameAndLocation(nameAndLoc) + {} // static queries - virtual NameAndLocation const& nameAndLocation() const = 0; + NameAndLocation const& nameAndLocation() const { + return m_nameAndLocation; + } + + virtual ~ITracker(); // dynamic queries virtual bool isComplete() const = 0; // Successfully completed or failed virtual bool isSuccessfullyCompleted() const = 0; virtual bool isOpen() const = 0; // Started but not complete virtual bool hasChildren() const = 0; + virtual bool hasStarted() const = 0; virtual ITracker& parent() = 0; @@ -7435,7 +7565,6 @@ namespace TestCaseTracking { }; using Children = std::vector; - NameAndLocation m_nameAndLocation; TrackerContext& m_ctx; ITracker* m_parent; Children m_children; @@ -7444,11 +7573,13 @@ namespace TestCaseTracking { public: TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent ); - NameAndLocation const& nameAndLocation() const override; bool isComplete() const override; bool isSuccessfullyCompleted() const override; bool isOpen() const override; bool hasChildren() const override; + bool hasStarted() const override { + return m_runState != NotStarted; + } void addChild( ITrackerPtr const& child ) override; @@ -7487,6 +7618,10 @@ namespace TestCaseTracking { void addInitialFilters( std::vector const& filters ); void addNextFilters( std::vector const& filters ); + //! Returns filters active in this tracker + std::vector const& getFilters() const; + //! 
Returns whitespace-trimmed name of the tracked section + std::string const& trimmedName() const; }; } // namespace TestCaseTracking @@ -7652,7 +7787,7 @@ namespace Catch { double sb = stddev.point; double mn = mean.point / n; double mg_min = mn / 2.; - double sg = std::min(mg_min / 4., sb / std::sqrt(n)); + double sg = (std::min)(mg_min / 4., sb / std::sqrt(n)); double sg2 = sg * sg; double sb2 = sb * sb; @@ -7671,13 +7806,14 @@ namespace Catch { return (nc / n) * (sb2 - nc * sg2); }; - return std::min(var_out(1), var_out(std::min(c_max(0.), c_max(mg_min)))) / sb2; + return (std::min)(var_out(1), var_out((std::min)(c_max(0.), c_max(mg_min)))) / sb2; } bootstrap_analysis analyse_samples(double confidence_level, int n_resamples, std::vector::iterator first, std::vector::iterator last) { + CATCH_INTERNAL_START_WARNINGS_SUPPRESSION CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS static std::random_device entropy; - CATCH_INTERNAL_UNSUPPRESS_GLOBALS_WARNINGS + CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION auto n = static_cast(last - first); // seriously, one can't use integral types without hell in C++ @@ -7810,7 +7946,24 @@ namespace Catch { #ifdef CATCH_PLATFORM_MAC - #define CATCH_TRAP() __asm__("int $3\n" : : ) /* NOLINT */ + #if defined(__i386__) || defined(__x86_64__) + #define CATCH_TRAP() __asm__("int $3\n" : : ) /* NOLINT */ + #elif defined(__aarch64__) + #define CATCH_TRAP() __asm__(".inst 0xd4200000") + #endif + +#elif defined(CATCH_PLATFORM_IPHONE) + + // use inline assembler + #if defined(__i386__) || defined(__x86_64__) + #define CATCH_TRAP() __asm__("int $3") + #elif defined(__aarch64__) + #define CATCH_TRAP() __asm__(".inst 0xd4200000") + #elif defined(__arm__) && !defined(__thumb__) + #define CATCH_TRAP() __asm__(".inst 0xe7f001f0") + #elif defined(__arm__) && defined(__thumb__) + #define CATCH_TRAP() __asm__(".inst 0xde01") + #endif #elif defined(CATCH_PLATFORM_LINUX) // If we can use inline assembler, do it because this allows us to break @@ -7830,10 +7983,12 @@ namespace Catch { #define CATCH_TRAP() DebugBreak() #endif -#ifdef CATCH_TRAP - #define CATCH_BREAK_INTO_DEBUGGER() []{ if( Catch::isDebuggerActive() ) { CATCH_TRAP(); } }() -#else - #define CATCH_BREAK_INTO_DEBUGGER() []{}() +#ifndef CATCH_BREAK_INTO_DEBUGGER + #ifdef CATCH_TRAP + #define CATCH_BREAK_INTO_DEBUGGER() []{ if( Catch::isDebuggerActive() ) { CATCH_TRAP(); } }() + #else + #define CATCH_BREAK_INTO_DEBUGGER() []{}() + #endif #endif // end catch_debugger.h @@ -7841,86 +7996,58 @@ namespace Catch { // start catch_fatal_condition.h -// start catch_windows_h_proxy.h - - -#if defined(CATCH_PLATFORM_WINDOWS) - -#if !defined(NOMINMAX) && !defined(CATCH_CONFIG_NO_NOMINMAX) -# define CATCH_DEFINED_NOMINMAX -# define NOMINMAX -#endif -#if !defined(WIN32_LEAN_AND_MEAN) && !defined(CATCH_CONFIG_NO_WIN32_LEAN_AND_MEAN) -# define CATCH_DEFINED_WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -#endif - -#ifdef __AFXDLL -#include -#else -#include -#endif - -#ifdef CATCH_DEFINED_NOMINMAX -# undef NOMINMAX -#endif -#ifdef CATCH_DEFINED_WIN32_LEAN_AND_MEAN -# undef WIN32_LEAN_AND_MEAN -#endif - -#endif // defined(CATCH_PLATFORM_WINDOWS) - -// end catch_windows_h_proxy.h -#if defined( CATCH_CONFIG_WINDOWS_SEH ) +#include namespace Catch { - struct FatalConditionHandler { - - static LONG CALLBACK handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo); + // Wrapper for platform-specific fatal error (signals/SEH) handlers + // + // Tries to be cooperative with other handlers, and not step over + // other handlers. 
This means that unknown structured exceptions + // are passed on, previous signal handlers are called, and so on. + // + // Can only be instantiated once, and assumes that once a signal + // is caught, the binary will end up terminating. Thus, there + class FatalConditionHandler { + bool m_started = false; + + // Install/disengage implementation for specific platform. + // Should be if-defed to work on current platform, can assume + // engage-disengage 1:1 pairing. + void engage_platform(); + void disengage_platform(); + public: + // Should also have platform-specific implementations as needed FatalConditionHandler(); - static void reset(); ~FatalConditionHandler(); - private: - static bool isSet; - static ULONG guaranteeSize; - static PVOID exceptionHandlerHandle; - }; - -} // namespace Catch - -#elif defined ( CATCH_CONFIG_POSIX_SIGNALS ) - -#include - -namespace Catch { - - struct FatalConditionHandler { - - static bool isSet; - static struct sigaction oldSigActions[]; - static stack_t oldSigStack; - static char altStackMem[]; - - static void handleSignal( int sig ); + void engage() { + assert(!m_started && "Handler cannot be installed twice."); + m_started = true; + engage_platform(); + } - FatalConditionHandler(); - ~FatalConditionHandler(); - static void reset(); + void disengage() { + assert(m_started && "Handler cannot be uninstalled without being installed first"); + m_started = false; + disengage_platform(); + } }; -} // namespace Catch - -#else - -namespace Catch { - struct FatalConditionHandler { - void reset(); + //! Simple RAII guard for (dis)engaging the FatalConditionHandler + class FatalConditionHandlerGuard { + FatalConditionHandler* m_handler; + public: + FatalConditionHandlerGuard(FatalConditionHandler* handler): + m_handler(handler) { + m_handler->engage(); + } + ~FatalConditionHandlerGuard() { + m_handler->disengage(); + } }; -} -#endif +} // end namespace Catch // end catch_fatal_condition.h #include @@ -7980,7 +8107,7 @@ namespace Catch { void sectionEnded( SectionEndInfo const& endInfo ) override; void sectionEndedEarly( SectionEndInfo const& endInfo ) override; - auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& override; + auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& override; #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) void benchmarkPreparing( std::string const& name ) override; @@ -8046,6 +8173,7 @@ namespace Catch { std::vector m_unfinishedSections; std::vector m_activeSections; TrackerContext m_trackerContext; + FatalConditionHandler m_fatalConditionhandler; bool m_lastAssertionPassed = false; bool m_shouldReportUnexpected = true; bool m_includeSuccessfulResults; @@ -8956,7 +9084,7 @@ namespace detail { } inline auto convertInto( std::string const &source, bool &target ) -> ParserResult { std::string srcLC = source; - std::transform( srcLC.begin(), srcLC.end(), srcLC.begin(), []( char c ) { return static_cast( std::tolower(c) ); } ); + std::transform( srcLC.begin(), srcLC.end(), srcLC.begin(), []( unsigned char c ) { return static_cast( std::tolower(c) ); } ); if (srcLC == "y" || srcLC == "1" || srcLC == "true" || srcLC == "yes" || srcLC == "on") target = true; else if (srcLC == "n" || srcLC == "0" || srcLC == "false" || srcLC == "no" || srcLC == "off") @@ -9605,8 +9733,7 @@ namespace Catch { if( !startsWith( line, '"' ) ) line = '"' + line + '"'; config.testsOrTags.push_back( line ); - config.testsOrTags.push_back( "," ); - + config.testsOrTags.emplace_back( 
"," ); } } //Remove comma in the end @@ -9647,14 +9774,16 @@ namespace Catch { }; auto const setWaitForKeypress = [&]( std::string const& keypress ) { auto keypressLc = toLower( keypress ); - if( keypressLc == "start" ) + if (keypressLc == "never") + config.waitForKeypress = WaitForKeypress::Never; + else if( keypressLc == "start" ) config.waitForKeypress = WaitForKeypress::BeforeStart; else if( keypressLc == "exit" ) config.waitForKeypress = WaitForKeypress::BeforeExit; else if( keypressLc == "both" ) config.waitForKeypress = WaitForKeypress::BeforeStartAndExit; else - return ParserResult::runtimeError( "keypress argument must be one of: start, exit or both. '" + keypress + "' not recognised" ); + return ParserResult::runtimeError( "keypress argument must be one of: never, start, exit or both. '" + keypress + "' not recognised" ); return ParserResult::ok( ParseResultType::Matched ); }; auto const setVerbosity = [&]( std::string const& verbosity ) { @@ -9724,6 +9853,9 @@ namespace Catch { | Opt( [&]( bool flag ) { config.showDurations = flag ? ShowDurations::Always : ShowDurations::Never; }, "yes|no" ) ["-d"]["--durations"] ( "show test durations" ) + | Opt( config.minDuration, "seconds" ) + ["-D"]["--min-duration"] + ( "show test durations for tests taking at least the given number of seconds" ) | Opt( loadTestNamesFromFile, "filename" ) ["-f"]["--input-file"] ( "load test names to run from a file" ) @@ -9754,7 +9886,7 @@ namespace Catch { | Opt( config.libIdentify ) ["--libidentify"] ( "report name and version according to libidentify standard" ) - | Opt( setWaitForKeypress, "start|exit|both" ) + | Opt( setWaitForKeypress, "never|start|exit|both" ) ["--wait-for-keypress"] ( "waits for a keypress before exiting" ) | Opt( config.benchmarkSamples, "samples" ) @@ -9769,7 +9901,10 @@ namespace Catch { | Opt( config.benchmarkNoAnalysis ) ["--benchmark-no-analysis"] ( "perform only measurements; do not perform any analysis" ) - | Arg( config.testsOrTags, "test name|pattern|tags" ) + | Opt( config.benchmarkWarmupTime, "benchmarkWarmupTime" ) + ["--benchmark-warmup-time"] + ( "amount of time in milliseconds spent on warming up each test (default: 100)" ) + | Arg( config.testsOrTags, "test name|pattern|tags" ) ( "which test or tests to use" ); return cli; @@ -9868,6 +10003,7 @@ namespace Catch { bool Config::warnAboutMissingAssertions() const { return !!(m_data.warnings & WarnAbout::NoAssertions); } bool Config::warnAboutNoTests() const { return !!(m_data.warnings & WarnAbout::NoTests); } ShowDurations::OrNot Config::showDurations() const { return m_data.showDurations; } + double Config::minDuration() const { return m_data.minDuration; } RunTests::InWhatOrder Config::runOrder() const { return m_data.runOrder; } unsigned int Config::rngSeed() const { return m_data.rngSeed; } UseColour::YesOrNo Config::useColour() const { return m_data.useColour; } @@ -9876,10 +10012,11 @@ namespace Catch { bool Config::showInvisibles() const { return m_data.showInvisibles; } Verbosity Config::verbosity() const { return m_data.verbosity; } - bool Config::benchmarkNoAnalysis() const { return m_data.benchmarkNoAnalysis; } - int Config::benchmarkSamples() const { return m_data.benchmarkSamples; } - double Config::benchmarkConfidenceInterval() const { return m_data.benchmarkConfidenceInterval; } - unsigned int Config::benchmarkResamples() const { return m_data.benchmarkResamples; } + bool Config::benchmarkNoAnalysis() const { return m_data.benchmarkNoAnalysis; } + int Config::benchmarkSamples() const { return 
m_data.benchmarkSamples; } + double Config::benchmarkConfidenceInterval() const { return m_data.benchmarkConfidenceInterval; } + unsigned int Config::benchmarkResamples() const { return m_data.benchmarkResamples; } + std::chrono::milliseconds Config::benchmarkWarmupTime() const { return std::chrono::milliseconds(m_data.benchmarkWarmupTime); } IStream const* Config::openStream() { return Catch::makeStream(m_data.outputFilename); @@ -9909,24 +10046,54 @@ namespace Catch { } // end catch_errno_guard.h -#include +// start catch_windows_h_proxy.h -namespace Catch { - namespace { - struct IColourImpl { - virtual ~IColourImpl() = default; - virtual void use( Colour::Code _colourCode ) = 0; - }; +#if defined(CATCH_PLATFORM_WINDOWS) - struct NoColourImpl : IColourImpl { - void use( Colour::Code ) {} +#if !defined(NOMINMAX) && !defined(CATCH_CONFIG_NO_NOMINMAX) +# define CATCH_DEFINED_NOMINMAX +# define NOMINMAX +#endif +#if !defined(WIN32_LEAN_AND_MEAN) && !defined(CATCH_CONFIG_NO_WIN32_LEAN_AND_MEAN) +# define CATCH_DEFINED_WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif - static IColourImpl* instance() { - static NoColourImpl s_instance; - return &s_instance; - } - }; +#ifdef __AFXDLL +#include +#else +#include +#endif + +#ifdef CATCH_DEFINED_NOMINMAX +# undef NOMINMAX +#endif +#ifdef CATCH_DEFINED_WIN32_LEAN_AND_MEAN +# undef WIN32_LEAN_AND_MEAN +#endif + +#endif // defined(CATCH_PLATFORM_WINDOWS) + +// end catch_windows_h_proxy.h +#include + +namespace Catch { + namespace { + + struct IColourImpl { + virtual ~IColourImpl() = default; + virtual void use( Colour::Code _colourCode ) = 0; + }; + + struct NoColourImpl : IColourImpl { + void use( Colour::Code ) override {} + + static IColourImpl* instance() { + static NoColourImpl s_instance; + return &s_instance; + } + }; } // anon namespace } // namespace Catch @@ -10052,7 +10219,7 @@ namespace { bool useColourOnPlatform() { return -#ifdef CATCH_PLATFORM_MAC +#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE) !isDebuggerActive() && #endif #if !(defined(__DJGPP__) && defined(__STRICT_ANSI__)) @@ -10093,13 +10260,13 @@ namespace Catch { namespace Catch { Colour::Colour( Code _colourCode ) { use( _colourCode ); } - Colour::Colour( Colour&& rhs ) noexcept { - m_moved = rhs.m_moved; - rhs.m_moved = true; + Colour::Colour( Colour&& other ) noexcept { + m_moved = other.m_moved; + other.m_moved = true; } - Colour& Colour::operator=( Colour&& rhs ) noexcept { - m_moved = rhs.m_moved; - rhs.m_moved = true; + Colour& Colour::operator=( Colour&& other ) noexcept { + m_moved = other.m_moved; + other.m_moved = true; return *this; } @@ -10111,7 +10278,7 @@ namespace Catch { // However, under some conditions it does happen (see #1626), // and this change is small enough that we can let practicality // triumph over purity in this case. 
- if (impl != NULL) { + if (impl != nullptr) { impl->use( _colourCode ); } } @@ -10229,10 +10396,9 @@ namespace Catch { // end catch_debug_console.cpp // start catch_debugger.cpp -#ifdef CATCH_PLATFORM_MAC +#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE) -# include -# include +# include # include # include # include @@ -10426,7 +10592,7 @@ namespace Catch { // Extracts the actual name part of an enum instance // In other words, it returns the Blue part of Bikeshed::Colour::Blue StringRef extractInstanceName(StringRef enumInstance) { - // Find last occurence of ":" + // Find last occurrence of ":" size_t name_start = enumInstance.size(); while (name_start > 0 && enumInstance[name_start - 1] != ':') { --name_start; @@ -10464,7 +10630,7 @@ namespace Catch { assert( valueNames.size() == values.size() ); std::size_t i = 0; for( auto value : values ) - enumInfo->m_values.push_back({ value, valueNames[i++] }); + enumInfo->m_values.emplace_back(value, valueNames[i++]); return enumInfo; } @@ -10588,25 +10754,47 @@ namespace Catch { // end catch_exception_translator_registry.cpp // start catch_fatal_condition.cpp -#if defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wmissing-field-initializers" -#endif +#include + +#if !defined( CATCH_CONFIG_WINDOWS_SEH ) && !defined( CATCH_CONFIG_POSIX_SIGNALS ) + +namespace Catch { + + // If neither SEH nor signal handling is required, the handler impls + // do not have to do anything, and can be empty. + void FatalConditionHandler::engage_platform() {} + void FatalConditionHandler::disengage_platform() {} + FatalConditionHandler::FatalConditionHandler() = default; + FatalConditionHandler::~FatalConditionHandler() = default; + +} // end namespace Catch + +#endif // !CATCH_CONFIG_WINDOWS_SEH && !CATCH_CONFIG_POSIX_SIGNALS + +#if defined( CATCH_CONFIG_WINDOWS_SEH ) && defined( CATCH_CONFIG_POSIX_SIGNALS ) +#error "Inconsistent configuration: Windows' SEH handling and POSIX signals cannot be enabled at the same time" +#endif // CATCH_CONFIG_WINDOWS_SEH && CATCH_CONFIG_POSIX_SIGNALS #if defined( CATCH_CONFIG_WINDOWS_SEH ) || defined( CATCH_CONFIG_POSIX_SIGNALS ) namespace { - // Report the error condition + //! Signals fatal error message to the run context void reportFatal( char const * const message ) { Catch::getCurrentContext().getResultCapture()->handleFatalErrorCondition( message ); } -} -#endif // signals/SEH handling + //! Minimal size Catch2 needs for its own fatal error handling. + //! Picked anecdotally, so it might not be sufficient on all + //! platforms, and for all configurations. + constexpr std::size_t minStackSizeForErrors = 32 * 1024; +} // end unnamed namespace + +#endif // CATCH_CONFIG_WINDOWS_SEH || CATCH_CONFIG_POSIX_SIGNALS #if defined( CATCH_CONFIG_WINDOWS_SEH ) namespace Catch { + struct SignalDefs { DWORD id; const char* name; }; // There is no 1-1 mapping between signals and windows exceptions. 
@@ -10619,7 +10807,7 @@ namespace Catch { { static_cast(EXCEPTION_INT_DIVIDE_BY_ZERO), "Divide by zero error" }, }; - LONG CALLBACK FatalConditionHandler::handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo) { + static LONG CALLBACK handleVectoredException(PEXCEPTION_POINTERS ExceptionInfo) { for (auto const& def : signalDefs) { if (ExceptionInfo->ExceptionRecord->ExceptionCode == def.id) { reportFatal(def.name); @@ -10630,38 +10818,50 @@ namespace Catch { return EXCEPTION_CONTINUE_SEARCH; } + // Since we do not support multiple instantiations, we put these + // into global variables and rely on cleaning them up in outlined + // constructors/destructors + static PVOID exceptionHandlerHandle = nullptr; + + // For MSVC, we reserve part of the stack memory for handling + // memory overflow structured exception. FatalConditionHandler::FatalConditionHandler() { - isSet = true; - // 32k seems enough for Catch to handle stack overflow, - // but the value was found experimentally, so there is no strong guarantee - guaranteeSize = 32 * 1024; - exceptionHandlerHandle = nullptr; + ULONG guaranteeSize = static_cast(minStackSizeForErrors); + if (!SetThreadStackGuarantee(&guaranteeSize)) { + // We do not want to fully error out, because needing + // the stack reserve should be rare enough anyway. + Catch::cerr() + << "Failed to reserve piece of stack." + << " Stack overflows will not be reported successfully."; + } + } + + // We do not attempt to unset the stack guarantee, because + // Windows does not support lowering the stack size guarantee. + FatalConditionHandler::~FatalConditionHandler() = default; + + void FatalConditionHandler::engage_platform() { // Register as first handler in current chain exceptionHandlerHandle = AddVectoredExceptionHandler(1, handleVectoredException); - // Pass in guarantee size to be filled - SetThreadStackGuarantee(&guaranteeSize); + if (!exceptionHandlerHandle) { + CATCH_RUNTIME_ERROR("Could not register vectored exception handler"); + } } - void FatalConditionHandler::reset() { - if (isSet) { - RemoveVectoredExceptionHandler(exceptionHandlerHandle); - SetThreadStackGuarantee(&guaranteeSize); - exceptionHandlerHandle = nullptr; - isSet = false; + void FatalConditionHandler::disengage_platform() { + if (!RemoveVectoredExceptionHandler(exceptionHandlerHandle)) { + CATCH_RUNTIME_ERROR("Could not unregister vectored exception handler"); } + exceptionHandlerHandle = nullptr; } - FatalConditionHandler::~FatalConditionHandler() { - reset(); - } +} // end namespace Catch -bool FatalConditionHandler::isSet = false; -ULONG FatalConditionHandler::guaranteeSize = 0; -PVOID FatalConditionHandler::exceptionHandlerHandle = nullptr; +#endif // CATCH_CONFIG_WINDOWS_SEH -} // namespace Catch +#if defined( CATCH_CONFIG_POSIX_SIGNALS ) -#elif defined( CATCH_CONFIG_POSIX_SIGNALS ) +#include namespace Catch { @@ -10670,10 +10870,6 @@ namespace Catch { const char* name; }; - // 32kb for the alternate stack seems to be sufficient. However, this value - // is experimentally determined, so that's not guaranteed. - static constexpr std::size_t sigStackSize = 32768 >= MINSIGSTKSZ ? 
32768 : MINSIGSTKSZ; - static SignalDefs signalDefs[] = { { SIGINT, "SIGINT - Terminal interrupt signal" }, { SIGILL, "SIGILL - Illegal instruction signal" }, @@ -10683,7 +10879,32 @@ namespace Catch { { SIGABRT, "SIGABRT - Abort (abnormal termination) signal" } }; - void FatalConditionHandler::handleSignal( int sig ) { +// Older GCCs trigger -Wmissing-field-initializers for T foo = {} +// which is zero initialization, but not explicit. We want to avoid +// that. +#if defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#endif + + static char* altStackMem = nullptr; + static std::size_t altStackSize = 0; + static stack_t oldSigStack{}; + static struct sigaction oldSigActions[sizeof(signalDefs) / sizeof(SignalDefs)]{}; + + static void restorePreviousSignalHandlers() { + // We set signal handlers back to the previous ones. Hopefully + // nobody overwrote them in the meantime, and doesn't expect + // their signal handlers to live past ours given that they + // installed them after ours.. + for (std::size_t i = 0; i < sizeof(signalDefs) / sizeof(SignalDefs); ++i) { + sigaction(signalDefs[i].id, &oldSigActions[i], nullptr); + } + // Return the old stack + sigaltstack(&oldSigStack, nullptr); + } + + static void handleSignal( int sig ) { char const * name = ""; for (auto const& def : signalDefs) { if (sig == def.id) { @@ -10691,16 +10912,33 @@ namespace Catch { break; } } - reset(); - reportFatal(name); + // We need to restore previous signal handlers and let them do + // their thing, so that the users can have the debugger break + // when a signal is raised, and so on. + restorePreviousSignalHandlers(); + reportFatal( name ); raise( sig ); } FatalConditionHandler::FatalConditionHandler() { - isSet = true; + assert(!altStackMem && "Cannot initialize POSIX signal handler when one already exists"); + if (altStackSize == 0) { + altStackSize = std::max(static_cast(SIGSTKSZ), minStackSizeForErrors); + } + altStackMem = new char[altStackSize](); + } + + FatalConditionHandler::~FatalConditionHandler() { + delete[] altStackMem; + // We signal that another instance can be constructed by zeroing + // out the pointer. 
+ altStackMem = nullptr; + } + + void FatalConditionHandler::engage_platform() { stack_t sigStack; sigStack.ss_sp = altStackMem; - sigStack.ss_size = sigStackSize; + sigStack.ss_size = altStackSize; sigStack.ss_flags = 0; sigaltstack(&sigStack, &oldSigStack); struct sigaction sa = { }; @@ -10712,40 +10950,17 @@ namespace Catch { } } - FatalConditionHandler::~FatalConditionHandler() { - reset(); - } +#if defined(__GNUC__) +# pragma GCC diagnostic pop +#endif - void FatalConditionHandler::reset() { - if( isSet ) { - // Set signals back to previous values -- hopefully nobody overwrote them in the meantime - for( std::size_t i = 0; i < sizeof(signalDefs)/sizeof(SignalDefs); ++i ) { - sigaction(signalDefs[i].id, &oldSigActions[i], nullptr); - } - // Return the old stack - sigaltstack(&oldSigStack, nullptr); - isSet = false; - } + void FatalConditionHandler::disengage_platform() { + restorePreviousSignalHandlers(); } - bool FatalConditionHandler::isSet = false; - struct sigaction FatalConditionHandler::oldSigActions[sizeof(signalDefs)/sizeof(SignalDefs)] = {}; - stack_t FatalConditionHandler::oldSigStack = {}; - char FatalConditionHandler::altStackMem[sigStackSize] = {}; - -} // namespace Catch - -#else - -namespace Catch { - void FatalConditionHandler::reset() {} -} - -#endif // signals/SEH handling +} // end namespace Catch -#if defined(__GNUC__) -# pragma GCC diagnostic pop -#endif +#endif // CATCH_CONFIG_POSIX_SIGNALS // end catch_fatal_condition.cpp // start catch_generators.cpp @@ -10764,8 +10979,8 @@ namespace Generators { GeneratorUntypedBase::~GeneratorUntypedBase() {} - auto acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { - return getResultCapture().acquireGeneratorTracker( lineInfo ); + auto acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { + return getResultCapture().acquireGeneratorTracker( generatorName, lineInfo ); } } // namespace Generators @@ -11040,7 +11255,7 @@ namespace Catch { namespace Catch { std::size_t listTests( Config const& config ) { - TestSpec testSpec = config.testSpec(); + TestSpec const& testSpec = config.testSpec(); if( config.hasTestFilters() ) Catch::cout() << "Matching test cases:\n"; else { @@ -11074,7 +11289,7 @@ namespace Catch { } std::size_t listTestsNamesOnly( Config const& config ) { - TestSpec testSpec = config.testSpec(); + TestSpec const& testSpec = config.testSpec(); std::size_t matchedTests = 0; std::vector matchedTestCases = filterTests( getAllTestCasesSorted( config ), testSpec, config ); for( auto const& testCaseInfo : matchedTestCases ) { @@ -11112,7 +11327,7 @@ namespace Catch { } std::size_t listTags( Config const& config ) { - TestSpec testSpec = config.testSpec(); + TestSpec const& testSpec = config.testSpec(); if( config.hasTestFilters() ) Catch::cout() << "Tags for matching test cases:\n"; else { @@ -11300,20 +11515,13 @@ namespace { return lhs == rhs; } - auto ulpDiff = std::abs(lc - rc); + // static cast as a workaround for IBM XLC + auto ulpDiff = std::abs(static_cast(lc - rc)); return static_cast(ulpDiff) <= maxUlpDiff; } -} //end anonymous namespace - #if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER) -#if defined(__clang__) -#pragma clang diagnostic push -// The long double overload is currently unused -#pragma clang diagnostic ignored "-Wunused-function" -#endif - float nextafter(float x, float y) { return ::nextafterf(x, y); } @@ -11322,18 +11530,8 @@ namespace { return ::nextafter(x, y); } - long double nextafter(long double x, long double y) { - 
return ::nextafterl(x, y); - } - -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - #endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER ^^^ -namespace { - template FP step(FP start, FP direction, uint64_t steps) { for (uint64_t i = 0; i < steps; ++i) { @@ -11431,9 +11629,10 @@ namespace Floating { ret << ", "; write(ret, step(m_target, static_cast( INFINITY), m_ulps)); } else { - write(ret, step(static_cast(m_target), -INFINITY, m_ulps)); + // We have to cast INFINITY to float because of MinGW, see #1782 + write(ret, step(static_cast(m_target), static_cast(-INFINITY), m_ulps)); ret << ", "; - write(ret, step(static_cast(m_target), INFINITY, m_ulps)); + write(ret, step(static_cast(m_target), static_cast( INFINITY), m_ulps)); } ret << "])"; @@ -11491,7 +11690,6 @@ Floating::WithinRelMatcher WithinRel(float target) { } // namespace Matchers } // namespace Catch - // end catch_matchers_floating.cpp // start catch_matchers_generic.cpp @@ -11669,10 +11867,10 @@ namespace Catch { Capturer::Capturer( StringRef macroName, SourceLineInfo const& lineInfo, ResultWas::OfType resultType, StringRef names ) { auto trimmed = [&] (size_t start, size_t end) { - while (names[start] == ',' || isspace(names[start])) { + while (names[start] == ',' || isspace(static_cast(names[start]))) { ++start; } - while (names[end] == ',' || isspace(names[end])) { + while (names[end] == ',' || isspace(static_cast(names[end]))) { --end; } return names.substr(start, end - start + 1); @@ -11711,7 +11909,7 @@ namespace Catch { pos = skipq(pos, c); break; case ',': - if (start != pos && openings.size() == 0) { + if (start != pos && openings.empty()) { m_messages.emplace_back(macroName, lineInfo, resultType); m_messages.back().message = static_cast(trimmed(start, pos)); m_messages.back().message += " := "; @@ -11719,7 +11917,7 @@ namespace Catch { } } } - assert(openings.size() == 0 && "Mismatched openings"); + assert(openings.empty() && "Mismatched openings"); m_messages.emplace_back(macroName, lineInfo, resultType); m_messages.back().message = static_cast(trimmed(start, names.size() - 1)); m_messages.back().message += " := "; @@ -11907,7 +12105,7 @@ namespace Catch { if (tmpnam_s(m_buffer)) { CATCH_RUNTIME_ERROR("Could not get a temp filename"); } - if (fopen_s(&m_file, m_buffer, "w")) { + if (fopen_s(&m_file, m_buffer, "w+")) { char buffer[100]; if (strerror_s(buffer, errno)) { CATCH_RUNTIME_ERROR("Could not translate errno to a string"); @@ -12202,11 +12400,13 @@ namespace Catch { namespace Catch { class StartupExceptionRegistry { +#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) public: void add(std::exception_ptr const& exception) noexcept; std::vector const& getExceptions() const noexcept; private: std::vector m_exceptions; +#endif }; } // end namespace Catch @@ -12289,7 +12489,11 @@ namespace Catch { m_tagAliasRegistry.add( alias, tag, lineInfo ); } void registerStartupException() noexcept override { +#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) m_exceptionRegistry.add(std::current_exception()); +#else + CATCH_INTERNAL_ERROR("Attempted to register active exception under CATCH_CONFIG_DISABLE_EXCEPTIONS!"); +#endif } IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() override { return m_enumValuesRegistry; @@ -12393,17 +12597,32 @@ namespace Catch { std::shared_ptr tracker; ITracker& currentTracker = ctx.currentTracker(); - if( TestCaseTracking::ITrackerPtr childTracker = currentTracker.findChild( nameAndLocation ) ) { + // Under specific circumstances, the generator we want + // to acquire is also the current 
tracker. If this is + // the case, we have to avoid looking through current + // tracker's children, and instead return the current + // tracker. + // A case where this check is important is e.g. + // for (int i = 0; i < 5; ++i) { + // int n = GENERATE(1, 2); + // } + // + // without it, the code above creates 5 nested generators. + if (currentTracker.nameAndLocation() == nameAndLocation) { + auto thisTracker = currentTracker.parent().findChild(nameAndLocation); + assert(thisTracker); + assert(thisTracker->isGeneratorTracker()); + tracker = std::static_pointer_cast(thisTracker); + } else if ( TestCaseTracking::ITrackerPtr childTracker = currentTracker.findChild( nameAndLocation ) ) { assert( childTracker ); assert( childTracker->isGeneratorTracker() ); tracker = std::static_pointer_cast( childTracker ); - } - else { + } else { tracker = std::make_shared( nameAndLocation, ctx, ¤tTracker ); currentTracker.addChild( tracker ); } - if( !ctx.completedCycle() && !tracker->isComplete() ) { + if( !tracker->isComplete() ) { tracker->open(); } @@ -12417,8 +12636,68 @@ namespace Catch { } void close() override { TrackerBase::close(); - // Generator interface only finds out if it has another item on atual move - if (m_runState == CompletedSuccessfully && m_generator->next()) { + // If a generator has a child (it is followed by a section) + // and none of its children have started, then we must wait + // until later to start consuming its values. + // This catches cases where `GENERATE` is placed between two + // `SECTION`s. + // **The check for m_children.empty cannot be removed**. + // doing so would break `GENERATE` _not_ followed by `SECTION`s. + const bool should_wait_for_child = [&]() { + // No children -> nobody to wait for + if ( m_children.empty() ) { + return false; + } + // If at least one child started executing, don't wait + if ( std::find_if( + m_children.begin(), + m_children.end(), + []( TestCaseTracking::ITrackerPtr tracker ) { + return tracker->hasStarted(); + } ) != m_children.end() ) { + return false; + } + + // No children have started. We need to check if they _can_ + // start, and thus we should wait for them, or they cannot + // start (due to filters), and we shouldn't wait for them + auto* parent = m_parent; + // This is safe: there is always at least one section + // tracker in a test case tracking tree + while ( !parent->isSectionTracker() ) { + parent = &( parent->parent() ); + } + assert( parent && + "Missing root (test case) level section" ); + + auto const& parentSection = + static_cast( *parent ); + auto const& filters = parentSection.getFilters(); + // No filters -> no restrictions on running sections + if ( filters.empty() ) { + return true; + } + + for ( auto const& child : m_children ) { + if ( child->isSectionTracker() && + std::find( filters.begin(), + filters.end(), + static_cast( *child ) + .trimmedName() ) != + filters.end() ) { + return true; + } + } + return false; + }(); + + // This check is a bit tricky, because m_generator->next() + // has a side-effect, where it consumes generator's current + // value, but we do not want to invoke the side-effect if + // this generator is still waiting for any child to start. 
+ if ( should_wait_for_child || + ( m_runState == CompletedSuccessfully && + m_generator->next() ) ) { m_children.clear(); m_runState = Executing; } @@ -12554,10 +12833,10 @@ namespace Catch { return true; } - auto RunContext::acquireGeneratorTracker( SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { + auto RunContext::acquireGeneratorTracker( StringRef generatorName, SourceLineInfo const& lineInfo ) -> IGeneratorTracker& { using namespace Generators; - GeneratorTracker& tracker = GeneratorTracker::acquire( m_trackerContext, TestCaseTracking::NameAndLocation( "generator", lineInfo ) ); - assert( tracker.isOpen() ); + GeneratorTracker& tracker = GeneratorTracker::acquire(m_trackerContext, + TestCaseTracking::NameAndLocation( static_cast(generatorName), lineInfo ) ); m_lastAssertionInfo.lineInfo = lineInfo; return tracker; } @@ -12600,17 +12879,17 @@ namespace Catch { #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) void RunContext::benchmarkPreparing(std::string const& name) { - m_reporter->benchmarkPreparing(name); - } + m_reporter->benchmarkPreparing(name); + } void RunContext::benchmarkStarting( BenchmarkInfo const& info ) { m_reporter->benchmarkStarting( info ); } void RunContext::benchmarkEnded( BenchmarkStats<> const& stats ) { m_reporter->benchmarkEnded( stats ); } - void RunContext::benchmarkFailed(std::string const & error) { - m_reporter->benchmarkFailed(error); - } + void RunContext::benchmarkFailed(std::string const & error) { + m_reporter->benchmarkFailed(error); + } #endif // CATCH_CONFIG_ENABLE_BENCHMARKING void RunContext::pushScopedMessage(MessageInfo const & message) { @@ -12744,9 +13023,8 @@ namespace Catch { } void RunContext::invokeActiveTestCase() { - FatalConditionHandler fatalConditionHandler; // Handle signals + FatalConditionHandlerGuard _(&m_fatalConditionhandler); m_activeTestCase->invoke(); - fatalConditionHandler.reset(); } void RunContext::handleUnfinishedSections() { @@ -13114,6 +13392,10 @@ namespace Catch { filename.erase(0, lastSlash); filename[0] = '#'; } + else + { + filename.insert(0, "#"); + } auto lastDot = filename.find_last_of('.'); if (lastDot != std::string::npos) { @@ -13207,11 +13489,11 @@ namespace Catch { char **utf8Argv = new char *[ argc ]; for ( int i = 0; i < argc; ++i ) { - int bufSize = WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, NULL, 0, NULL, NULL ); + int bufSize = WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, nullptr, 0, nullptr, nullptr ); utf8Argv[ i ] = new char[ bufSize ]; - WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, utf8Argv[i], bufSize, NULL, NULL ); + WideCharToMultiByte( CP_UTF8, 0, argv[i], -1, utf8Argv[i], bufSize, nullptr, nullptr ); } int returnCode = applyCommandLine( argc, utf8Argv ); @@ -13331,6 +13613,7 @@ namespace Catch { // end catch_singletons.cpp // start catch_startup_exception_registry.cpp +#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) namespace Catch { void StartupExceptionRegistry::add( std::exception_ptr const& exception ) noexcept { CATCH_TRY { @@ -13346,6 +13629,7 @@ void StartupExceptionRegistry::add( std::exception_ptr const& exception ) noexce } } // end namespace Catch +#endif // end catch_startup_exception_registry.cpp // start catch_stream.cpp @@ -13530,7 +13814,7 @@ namespace Catch { namespace { char toLowerCh(char c) { - return static_cast( std::tolower( c ) ); + return static_cast( std::tolower( static_cast(c) ) ); } } @@ -13622,11 +13906,7 @@ namespace Catch { // end catch_string_manip.cpp // start catch_stringref.cpp -#if defined(__clang__) -# pragma clang diagnostic push -# pragma 
clang diagnostic ignored "-Wexit-time-destructors" -#endif - +#include #include #include #include @@ -13636,66 +13916,36 @@ namespace Catch { : StringRef( rawChars, static_cast(std::strlen(rawChars) ) ) {} - void StringRef::swap( StringRef& other ) noexcept { - std::swap( m_start, other.m_start ); - std::swap( m_size, other.m_size ); - std::swap( m_data, other.m_data ); - } - auto StringRef::c_str() const -> char const* { - if( !isSubstring() ) - return m_start; - - const_cast( this )->takeOwnership(); - return m_data; - } - auto StringRef::currentData() const noexcept -> char const* { + CATCH_ENFORCE(isNullTerminated(), "Called StringRef::c_str() on a non-null-terminated instance"); return m_start; } - - auto StringRef::isOwned() const noexcept -> bool { - return m_data != nullptr; - } - auto StringRef::isSubstring() const noexcept -> bool { - return m_start[m_size] != '\0'; + auto StringRef::data() const noexcept -> char const* { + return m_start; } - void StringRef::takeOwnership() { - if( !isOwned() ) { - m_data = new char[m_size+1]; - memcpy( m_data, m_start, m_size ); - m_data[m_size] = '\0'; - } - } auto StringRef::substr( size_type start, size_type size ) const noexcept -> StringRef { - if( start < m_size ) - return StringRef( m_start+start, size ); - else + if (start < m_size) { + return StringRef(m_start + start, (std::min)(m_size - start, size)); + } else { return StringRef(); + } } auto StringRef::operator == ( StringRef const& other ) const noexcept -> bool { - return - size() == other.size() && - (std::strncmp( m_start, other.m_start, size() ) == 0); - } - auto StringRef::operator != ( StringRef const& other ) const noexcept -> bool { - return !operator==( other ); + return m_size == other.m_size + && (std::memcmp( m_start, other.m_start, m_size ) == 0); } auto operator << ( std::ostream& os, StringRef const& str ) -> std::ostream& { - return os.write(str.currentData(), str.size()); + return os.write(str.data(), str.size()); } auto operator+=( std::string& lhs, StringRef const& rhs ) -> std::string& { - lhs.append(rhs.currentData(), rhs.size()); + lhs.append(rhs.data(), rhs.size()); return lhs; } } // namespace Catch - -#if defined(__clang__) -# pragma clang diagnostic pop -#endif // end catch_stringref.cpp // start catch_tag_alias.cpp @@ -13844,7 +14094,8 @@ namespace Catch { } } if( isHidden ) { - tags.push_back( "." ); + // Add all "hidden" tags to make them behave identically + tags.insert( tags.end(), { ".", "!hide" } ); } TestCaseInfo info( static_cast(nameAndTags.name), _className, desc, tags, _lineInfo ); @@ -13939,27 +14190,81 @@ namespace Catch { // end catch_test_case_info.cpp // start catch_test_case_registry_impl.cpp +#include #include namespace Catch { - std::vector sortTests( IConfig const& config, std::vector const& unsortedTestCases ) { + namespace { + struct TestHasher { + using hash_t = uint64_t; + + explicit TestHasher( hash_t hashSuffix ): + m_hashSuffix{ hashSuffix } {} + + uint32_t operator()( TestCase const& t ) const { + // FNV-1a hash with multiplication fold. 
+ const hash_t prime = 1099511628211u; + hash_t hash = 14695981039346656037u; + for ( const char c : t.name ) { + hash ^= c; + hash *= prime; + } + hash ^= m_hashSuffix; + hash *= prime; + const uint32_t low{ static_cast( hash ) }; + const uint32_t high{ static_cast( hash >> 32 ) }; + return low * high; + } - std::vector sorted = unsortedTestCases; + private: + hash_t m_hashSuffix; + }; + } // end unnamed namespace + std::vector sortTests( IConfig const& config, std::vector const& unsortedTestCases ) { switch( config.runOrder() ) { - case RunTests::InLexicographicalOrder: - std::sort( sorted.begin(), sorted.end() ); - break; - case RunTests::InRandomOrder: - seedRng( config ); - std::shuffle( sorted.begin(), sorted.end(), rng() ); - break; case RunTests::InDeclarationOrder: // already in declaration order break; + + case RunTests::InLexicographicalOrder: { + std::vector sorted = unsortedTestCases; + std::sort( sorted.begin(), sorted.end() ); + return sorted; + } + + case RunTests::InRandomOrder: { + seedRng( config ); + TestHasher h{ config.rngSeed() }; + + using hashedTest = std::pair; + std::vector indexed_tests; + indexed_tests.reserve( unsortedTestCases.size() ); + + for (auto const& testCase : unsortedTestCases) { + indexed_tests.emplace_back(h(testCase), &testCase); + } + + std::sort(indexed_tests.begin(), indexed_tests.end(), + [](hashedTest const& lhs, hashedTest const& rhs) { + if (lhs.first == rhs.first) { + return lhs.second->name < rhs.second->name; + } + return lhs.first < rhs.first; + }); + + std::vector sorted; + sorted.reserve( indexed_tests.size() ); + + for (auto const& hashed : indexed_tests) { + sorted.emplace_back(*hashed.second); + } + + return sorted; + } } - return sorted; + return unsortedTestCases; } bool isThrowSafe( TestCase const& testCase, IConfig const& config ) { @@ -14096,15 +14401,12 @@ namespace TestCaseTracking { m_currentTracker = tracker; } - TrackerBase::TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent ) - : m_nameAndLocation( nameAndLocation ), + TrackerBase::TrackerBase( NameAndLocation const& nameAndLocation, TrackerContext& ctx, ITracker* parent ): + ITracker(nameAndLocation), m_ctx( ctx ), m_parent( parent ) {} - NameAndLocation const& TrackerBase::nameAndLocation() const { - return m_nameAndLocation; - } bool TrackerBase::isComplete() const { return m_runState == CompletedSuccessfully || m_runState == Failed; } @@ -14220,7 +14522,8 @@ namespace TestCaseTracking { bool SectionTracker::isComplete() const { bool complete = true; - if ((m_filters.empty() || m_filters[0] == "") + if (m_filters.empty() + || m_filters[0] == "" || std::find(m_filters.begin(), m_filters.end(), m_trimmed_name) != m_filters.end()) { complete = TrackerBase::isComplete(); } @@ -14255,8 +14558,8 @@ namespace TestCaseTracking { void SectionTracker::addInitialFilters( std::vector const& filters ) { if( !filters.empty() ) { m_filters.reserve( m_filters.size() + filters.size() + 2 ); - m_filters.push_back(""); // Root - should never be consulted - m_filters.push_back(""); // Test Case - not a section filter + m_filters.emplace_back(""); // Root - should never be consulted + m_filters.emplace_back(""); // Test Case - not a section filter m_filters.insert( m_filters.end(), filters.begin(), filters.end() ); } } @@ -14265,6 +14568,14 @@ namespace TestCaseTracking { m_filters.insert( m_filters.end(), filters.begin()+1, filters.end() ); } + std::vector const& SectionTracker::getFilters() const { + return m_filters; + } + + std::string const& 
SectionTracker::trimmedName() const { + return m_trimmed_name; + } + } // namespace TestCaseTracking using TestCaseTracking::ITracker; @@ -14498,9 +14809,9 @@ namespace Catch { switch( m_mode ) { case Name: case QuotedName: - return addPattern(); + return addNamePattern(); case Tag: - return addPattern(); + return addTagPattern(); case EscapedName: revertBackToLastMode(); return; @@ -14553,6 +14864,7 @@ namespace Catch { m_pos = m_arg.size(); m_substring.clear(); m_patternName.clear(); + m_realPatternPos = 0; return false; } endMode(); @@ -14560,6 +14872,63 @@ namespace Catch { return true; //success } + std::string TestSpecParser::preprocessPattern() { + std::string token = m_patternName; + for (std::size_t i = 0; i < m_escapeChars.size(); ++i) + token = token.substr(0, m_escapeChars[i] - i) + token.substr(m_escapeChars[i] - i + 1); + m_escapeChars.clear(); + if (startsWith(token, "exclude:")) { + m_exclusion = true; + token = token.substr(8); + } + + m_patternName.clear(); + m_realPatternPos = 0; + + return token; + } + + void TestSpecParser::addNamePattern() { + auto token = preprocessPattern(); + + if (!token.empty()) { + TestSpec::PatternPtr pattern = std::make_shared(token, m_substring); + if (m_exclusion) + pattern = std::make_shared(pattern); + m_currentFilter.m_patterns.push_back(pattern); + } + m_substring.clear(); + m_exclusion = false; + m_mode = None; + } + + void TestSpecParser::addTagPattern() { + auto token = preprocessPattern(); + + if (!token.empty()) { + // If the tag pattern is the "hide and tag" shorthand (e.g. [.foo]) + // we have to create a separate hide tag and shorten the real one + if (token.size() > 1 && token[0] == '.') { + token.erase(token.begin()); + TestSpec::PatternPtr pattern = std::make_shared(".", m_substring); + if (m_exclusion) { + pattern = std::make_shared(pattern); + } + m_currentFilter.m_patterns.push_back(pattern); + } + + TestSpec::PatternPtr pattern = std::make_shared(token, m_substring); + + if (m_exclusion) { + pattern = std::make_shared(pattern); + } + m_currentFilter.m_patterns.push_back(pattern); + } + m_substring.clear(); + m_exclusion = false; + m_mode = None; + } + TestSpec parseTestSpec( std::string const& arg ) { return TestSpecParser( ITagAliasRegistry::get() ).parse( arg ).testSpec(); } @@ -14661,13 +15030,11 @@ namespace Detail { enum Arch { Big, Little }; static Arch which() { - union _{ - int asInt; - char asChar[sizeof (int)]; - } u; - - u.asInt = 1; - return ( u.asChar[sizeof(int)-1] == 1 ) ? Big : Little; + int one = 1; + // If the lowest byte we read is non-zero, we can assume + // that little endian format is used. + auto value = *reinterpret_cast(&one); + return value ? Little : Big; } }; } @@ -14943,11 +15310,48 @@ namespace Catch { // end catch_totals.cpp // start catch_uncaught_exceptions.cpp +// start catch_config_uncaught_exceptions.hpp + +// Copyright Catch2 Authors +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// https://www.boost.org/LICENSE_1_0.txt) + +// SPDX-License-Identifier: BSL-1.0 + +#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP +#define CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP + +#if defined(_MSC_VER) +# if _MSC_VER >= 1900 // Visual Studio 2015 or newer +# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +# endif +#endif + +#include + +#if defined(__cpp_lib_uncaught_exceptions) \ + && !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) + +# define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +#endif // __cpp_lib_uncaught_exceptions + +#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) \ + && !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) \ + && !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) + +# define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS +#endif + +#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP +// end catch_config_uncaught_exceptions.hpp #include namespace Catch { bool uncaught_exceptions() { -#if defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) +#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) + return false; +#elif defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) return std::uncaught_exceptions() > 0; #else return std::uncaught_exception(); @@ -14987,7 +15391,7 @@ namespace Catch { } Version const& libraryVersion() { - static Version version( 2, 10, 1, "", 0 ); + static Version version( 2, 13, 9, "", 0 ); return version; } @@ -15035,8 +15439,7 @@ namespace Catch { // start catch_xmlwriter.cpp #include - -using uchar = unsigned char; +#include namespace Catch { @@ -15076,8 +15479,30 @@ namespace { os.flags(f); } + bool shouldNewline(XmlFormatting fmt) { + return !!(static_cast::type>(fmt & XmlFormatting::Newline)); + } + + bool shouldIndent(XmlFormatting fmt) { + return !!(static_cast::type>(fmt & XmlFormatting::Indent)); + } + } // anonymous namespace + XmlFormatting operator | (XmlFormatting lhs, XmlFormatting rhs) { + return static_cast( + static_cast::type>(lhs) | + static_cast::type>(rhs) + ); + } + + XmlFormatting operator & (XmlFormatting lhs, XmlFormatting rhs) { + return static_cast( + static_cast::type>(lhs) & + static_cast::type>(rhs) + ); + } + XmlEncode::XmlEncode( std::string const& str, ForWhat forWhat ) : m_str( str ), m_forWhat( forWhat ) @@ -15088,7 +15513,7 @@ namespace { // (see: http://www.w3.org/TR/xml/#syntax) for( std::size_t idx = 0; idx < m_str.size(); ++ idx ) { - uchar c = m_str[idx]; + unsigned char c = m_str[idx]; switch (c) { case '<': os << "<"; break; case '&': os << "&"; break; @@ -15148,7 +15573,7 @@ namespace { bool valid = true; uint32_t value = headerValue(c); for (std::size_t n = 1; n < encBytes; ++n) { - uchar nc = m_str[idx + n]; + unsigned char nc = m_str[idx + n]; valid &= ((nc & 0xC0) == 0x80); value = (value << 6) | (nc & 0x3F); } @@ -15182,13 +15607,17 @@ namespace { return os; } - XmlWriter::ScopedElement::ScopedElement( XmlWriter* writer ) - : m_writer( writer ) + XmlWriter::ScopedElement::ScopedElement( XmlWriter* writer, XmlFormatting fmt ) + : m_writer( writer ), + m_fmt(fmt) {} XmlWriter::ScopedElement::ScopedElement( ScopedElement&& other ) noexcept - : m_writer( other.m_writer ){ + : m_writer( other.m_writer ), + m_fmt(other.m_fmt) + { other.m_writer = nullptr; + other.m_fmt = XmlFormatting::None; } XmlWriter::ScopedElement& XmlWriter::ScopedElement::operator=( ScopedElement&& other ) noexcept { if ( m_writer ) { @@ -15196,16 +15625,19 @@ namespace { } m_writer = other.m_writer; other.m_writer = nullptr; + m_fmt = other.m_fmt; + other.m_fmt = 
XmlFormatting::None; return *this; } XmlWriter::ScopedElement::~ScopedElement() { - if( m_writer ) - m_writer->endElement(); + if (m_writer) { + m_writer->endElement(m_fmt); + } } - XmlWriter::ScopedElement& XmlWriter::ScopedElement::writeText( std::string const& text, bool indent ) { - m_writer->writeText( text, indent ); + XmlWriter::ScopedElement& XmlWriter::ScopedElement::writeText( std::string const& text, XmlFormatting fmt ) { + m_writer->writeText( text, fmt ); return *this; } @@ -15215,37 +15647,47 @@ namespace { } XmlWriter::~XmlWriter() { - while( !m_tags.empty() ) + while (!m_tags.empty()) { endElement(); + } + newlineIfNecessary(); } - XmlWriter& XmlWriter::startElement( std::string const& name ) { + XmlWriter& XmlWriter::startElement( std::string const& name, XmlFormatting fmt ) { ensureTagClosed(); newlineIfNecessary(); - m_os << m_indent << '<' << name; + if (shouldIndent(fmt)) { + m_os << m_indent; + m_indent += " "; + } + m_os << '<' << name; m_tags.push_back( name ); - m_indent += " "; m_tagIsOpen = true; + applyFormatting(fmt); return *this; } - XmlWriter::ScopedElement XmlWriter::scopedElement( std::string const& name ) { - ScopedElement scoped( this ); - startElement( name ); + XmlWriter::ScopedElement XmlWriter::scopedElement( std::string const& name, XmlFormatting fmt ) { + ScopedElement scoped( this, fmt ); + startElement( name, fmt ); return scoped; } - XmlWriter& XmlWriter::endElement() { - newlineIfNecessary(); - m_indent = m_indent.substr( 0, m_indent.size()-2 ); + XmlWriter& XmlWriter::endElement(XmlFormatting fmt) { + m_indent = m_indent.substr(0, m_indent.size() - 2); + if( m_tagIsOpen ) { m_os << "/>"; m_tagIsOpen = false; + } else { + newlineIfNecessary(); + if (shouldIndent(fmt)) { + m_os << m_indent; + } + m_os << ""; } - else { - m_os << m_indent << ""; - } - m_os << std::endl; + m_os << std::flush; + applyFormatting(fmt); m_tags.pop_back(); return *this; } @@ -15261,22 +15703,26 @@ namespace { return *this; } - XmlWriter& XmlWriter::writeText( std::string const& text, bool indent ) { + XmlWriter& XmlWriter::writeText( std::string const& text, XmlFormatting fmt) { if( !text.empty() ){ bool tagWasOpen = m_tagIsOpen; ensureTagClosed(); - if( tagWasOpen && indent ) + if (tagWasOpen && shouldIndent(fmt)) { m_os << m_indent; + } m_os << XmlEncode( text ); - m_needsNewline = true; + applyFormatting(fmt); } return *this; } - XmlWriter& XmlWriter::writeComment( std::string const& text ) { + XmlWriter& XmlWriter::writeComment( std::string const& text, XmlFormatting fmt) { ensureTagClosed(); - m_os << m_indent << ""; - m_needsNewline = true; + if (shouldIndent(fmt)) { + m_os << m_indent; + } + m_os << ""; + applyFormatting(fmt); return *this; } @@ -15292,11 +15738,16 @@ namespace { void XmlWriter::ensureTagClosed() { if( m_tagIsOpen ) { - m_os << ">" << std::endl; + m_os << '>' << std::flush; + newlineIfNecessary(); m_tagIsOpen = false; } } + void XmlWriter::applyFormatting(XmlFormatting fmt) { + m_needsNewline = shouldNewline(fmt); + } + void XmlWriter::writeDeclaration() { m_os << "\n"; } @@ -15342,6 +15793,17 @@ namespace Catch { return std::string(buffer); } + bool shouldShowDuration( IConfig const& config, double duration ) { + if ( config.showDurations() == ShowDurations::Always ) { + return true; + } + if ( config.showDurations() == ShowDurations::Never ) { + return false; + } + const double min = config.minDuration(); + return min >= 0 && duration >= min; + } + std::string serializeFilters( std::vector const& container ) { ReusableStringStream oss; bool 
first = true; @@ -15608,10 +16070,6 @@ class AssertionPrinter { return "Reports test results on a single line, suitable for IDEs"; } - ReporterPreferences CompactReporter::getPreferences() const { - return m_reporterPrefs; - } - void CompactReporter::noMatchingTestCases( std::string const& spec ) { stream << "No test cases matched '" << spec << '\'' << std::endl; } @@ -15638,8 +16096,9 @@ class AssertionPrinter { } void CompactReporter::sectionEnded(SectionStats const& _sectionStats) { - if (m_config->showDurations() == ShowDurations::Always) { - stream << getFormattedDuration(_sectionStats.durationInSeconds) << " s: " << _sectionStats.sectionInfo.name << std::endl; + double dur = _sectionStats.durationInSeconds; + if ( shouldShowDuration( *m_config, dur ) ) { + stream << getFormattedDuration( dur ) << " s: " << _sectionStats.sectionInfo.name << std::endl; } } @@ -15851,15 +16310,11 @@ class Duration { static const uint64_t s_nanosecondsInASecond = 1000 * s_nanosecondsInAMillisecond; static const uint64_t s_nanosecondsInAMinute = 60 * s_nanosecondsInASecond; - uint64_t m_inNanoseconds; + double m_inNanoseconds; Unit m_units; public: - explicit Duration(double inNanoseconds, Unit units = Unit::Auto) - : Duration(static_cast(inNanoseconds), units) { - } - - explicit Duration(uint64_t inNanoseconds, Unit units = Unit::Auto) + explicit Duration(double inNanoseconds, Unit units = Unit::Auto) : m_inNanoseconds(inNanoseconds), m_units(units) { if (m_units == Unit::Auto) { @@ -15888,7 +16343,7 @@ class Duration { case Unit::Minutes: return m_inNanoseconds / static_cast(s_nanosecondsInAMinute); default: - return static_cast(m_inNanoseconds); + return m_inNanoseconds; } } auto unitsAsString() const -> std::string { @@ -16007,7 +16462,7 @@ ConsoleReporter::ConsoleReporter(ReporterConfig const& config) else { return{ - { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 32, ColumnInfo::Left }, + { "benchmark name", CATCH_CONFIG_CONSOLE_WIDTH - 43, ColumnInfo::Left }, { "samples mean std dev", 14, ColumnInfo::Right }, { "iterations low mean low std dev", 14, ColumnInfo::Right }, { "estimated high mean high std dev", 14, ColumnInfo::Right } @@ -16063,8 +16518,9 @@ void ConsoleReporter::sectionEnded(SectionStats const& _sectionStats) { stream << "\nNo assertions in test case"; stream << " '" << _sectionStats.sectionInfo.name << "'\n" << std::endl; } - if (m_config->showDurations() == ShowDurations::Always) { - stream << getFormattedDuration(_sectionStats.durationInSeconds) << " s: " << _sectionStats.sectionInfo.name << std::endl; + double dur = _sectionStats.durationInSeconds; + if (shouldShowDuration(*m_config, dur)) { + stream << getFormattedDuration(dur) << " s: " << _sectionStats.sectionInfo.name << std::endl; } if (m_headerPrinted) { m_headerPrinted = false; @@ -16324,8 +16780,10 @@ void ConsoleReporter::printSummaryDivider() { } void ConsoleReporter::printTestFilters() { - if (m_config->testSpec().hasFilters()) - stream << Colour(Colour::BrightYellow) << "Filters: " << serializeFilters( m_config->getTestsOrTags() ) << '\n'; + if (m_config->testSpec().hasFilters()) { + Colour guard(Colour::BrightYellow); + stream << "Filters: " << serializeFilters(m_config->getTestsOrTags()) << '\n'; + } } CATCH_REGISTER_REPORTER("console", ConsoleReporter) @@ -16346,6 +16804,7 @@ CATCH_REGISTER_REPORTER("console", ConsoleReporter) #include #include #include +#include namespace Catch { @@ -16373,7 +16832,7 @@ namespace Catch { #else std::strftime(timeStamp, timeStampSize, fmt, timeInfo); #endif - return 
std::string(timeStamp); + return std::string(timeStamp, timeStampSize-1); } std::string fileNameTag(const std::vector &tags) { @@ -16384,6 +16843,17 @@ namespace Catch { return it->substr(1); return std::string(); } + + // Formats the duration in seconds to 3 decimal places. + // This is done because some genius defined Maven Surefire schema + // in a way that only accepts 3 decimal places, and tools like + // Jenkins use that schema for validation JUnit reporter output. + std::string formatDuration( double seconds ) { + ReusableStringStream rss; + rss << std::fixed << std::setprecision( 3 ) << seconds; + return rss.str(); + } + } // anonymous namespace JunitReporter::JunitReporter( ReporterConfig const& _config ) @@ -16453,7 +16923,7 @@ namespace Catch { if( m_config->showDurations() == ShowDurations::Never ) xml.writeAttribute( "time", "" ); else - xml.writeAttribute( "time", suiteTime ); + xml.writeAttribute( "time", formatDuration( suiteTime ) ); xml.writeAttribute( "timestamp", getCurrentTimestamp() ); // Write properties if there are any @@ -16475,8 +16945,8 @@ namespace Catch { for( auto const& child : groupNode.children ) writeTestCase( *child ); - xml.scopedElement( "system-out" ).writeText( trim( stdOutForSuite ), false ); - xml.scopedElement( "system-err" ).writeText( trim( stdErrForSuite ), false ); + xml.scopedElement( "system-out" ).writeText( trim( stdOutForSuite ), XmlFormatting::Newline ); + xml.scopedElement( "system-err" ).writeText( trim( stdErrForSuite ), XmlFormatting::Newline ); } void JunitReporter::writeTestCase( TestCaseNode const& testCaseNode ) { @@ -16498,12 +16968,13 @@ namespace Catch { if ( !m_config->name().empty() ) className = m_config->name() + "." + className; - writeSection( className, "", rootSection ); + writeSection( className, "", rootSection, stats.testInfo.okToFail() ); } - void JunitReporter::writeSection( std::string const& className, - std::string const& rootName, - SectionNode const& sectionNode ) { + void JunitReporter::writeSection( std::string const& className, + std::string const& rootName, + SectionNode const& sectionNode, + bool testOkToFail) { std::string name = trim( sectionNode.stats.sectionInfo.name ); if( !rootName.empty() ) name = rootName + '/' + name; @@ -16520,20 +16991,30 @@ namespace Catch { xml.writeAttribute( "classname", className ); xml.writeAttribute( "name", name ); } - xml.writeAttribute( "time", ::Catch::Detail::stringify( sectionNode.stats.durationInSeconds ) ); + xml.writeAttribute( "time", formatDuration( sectionNode.stats.durationInSeconds ) ); + // This is not ideal, but it should be enough to mimic gtest's + // junit output. + // Ideally the JUnit reporter would also handle `skipTest` + // events and write those out appropriately. 
+ xml.writeAttribute( "status", "run" ); + + if (sectionNode.stats.assertions.failedButOk) { + xml.scopedElement("skipped") + .writeAttribute("message", "TEST_CASE tagged with !mayfail"); + } writeAssertions( sectionNode ); if( !sectionNode.stdOut.empty() ) - xml.scopedElement( "system-out" ).writeText( trim( sectionNode.stdOut ), false ); + xml.scopedElement( "system-out" ).writeText( trim( sectionNode.stdOut ), XmlFormatting::Newline ); if( !sectionNode.stdErr.empty() ) - xml.scopedElement( "system-err" ).writeText( trim( sectionNode.stdErr ), false ); + xml.scopedElement( "system-err" ).writeText( trim( sectionNode.stdErr ), XmlFormatting::Newline ); } for( auto const& childNode : sectionNode.childSections ) if( className.empty() ) - writeSection( name, "", *childNode ); + writeSection( name, "", *childNode, testOkToFail ); else - writeSection( className, name, *childNode ); + writeSection( className, name, *childNode, testOkToFail ); } void JunitReporter::writeAssertions( SectionNode const& sectionNode ) { @@ -16551,11 +17032,7 @@ namespace Catch { elementName = "error"; break; case ResultWas::ExplicitFailure: - elementName = "failure"; - break; case ResultWas::ExpressionFailed: - elementName = "failure"; - break; case ResultWas::DidntThrowException: elementName = "failure"; break; @@ -16573,10 +17050,25 @@ namespace Catch { XmlWriter::ScopedElement e = xml.scopedElement( elementName ); - xml.writeAttribute( "message", result.getExpandedExpression() ); + xml.writeAttribute( "message", result.getExpression() ); xml.writeAttribute( "type", result.getTestMacroName() ); ReusableStringStream rss; + if (stats.totals.assertions.total() > 0) { + rss << "FAILED" << ":\n"; + if (result.hasExpression()) { + rss << " "; + rss << result.getExpressionInMacro(); + rss << '\n'; + } + if (result.hasExpandedExpression()) { + rss << "with expansion:\n"; + rss << Column(result.getExpandedExpression()).indent(2) << '\n'; + } + } else { + rss << '\n'; + } + if( !result.getMessage().empty() ) rss << result.getMessage() << '\n'; for( auto const& msg : stats.infoMessages ) @@ -16584,7 +17076,7 @@ namespace Catch { rss << msg.message << '\n'; rss << "at " << result.getSourceInfo(); - xml.writeText( rss.str(), false ); + xml.writeText( rss.str(), XmlFormatting::Newline ); } } @@ -16930,9 +17422,9 @@ namespace Catch { e.writeAttribute( "durationInSeconds", m_testCaseTimer.getElapsedSeconds() ); if( !testCaseStats.stdOut.empty() ) - m_xml.scopedElement( "StdOut" ).writeText( trim( testCaseStats.stdOut ), false ); + m_xml.scopedElement( "StdOut" ).writeText( trim( testCaseStats.stdOut ), XmlFormatting::Newline ); if( !testCaseStats.stdErr.empty() ) - m_xml.scopedElement( "StdErr" ).writeText( trim( testCaseStats.stdErr ), false ); + m_xml.scopedElement( "StdErr" ).writeText( trim( testCaseStats.stdErr ), XmlFormatting::Newline ); m_xml.endElement(); } @@ -16944,6 +17436,10 @@ namespace Catch { .writeAttribute( "successes", testGroupStats.totals.assertions.passed ) .writeAttribute( "failures", testGroupStats.totals.assertions.failed ) .writeAttribute( "expectedFailures", testGroupStats.totals.assertions.failedButOk ); + m_xml.scopedElement( "OverallResultsCases") + .writeAttribute( "successes", testGroupStats.totals.testCases.passed ) + .writeAttribute( "failures", testGroupStats.totals.testCases.failed ) + .writeAttribute( "expectedFailures", testGroupStats.totals.testCases.failedButOk ); m_xml.endElement(); } @@ -16953,6 +17449,10 @@ namespace Catch { .writeAttribute( "successes", 
testRunStats.totals.assertions.passed ) .writeAttribute( "failures", testRunStats.totals.assertions.failed ) .writeAttribute( "expectedFailures", testRunStats.totals.assertions.failedButOk ); + m_xml.scopedElement( "OverallResultsCases") + .writeAttribute( "successes", testRunStats.totals.testCases.passed ) + .writeAttribute( "failures", testRunStats.totals.testCases.failed ) + .writeAttribute( "expectedFailures", testRunStats.totals.testCases.failedButOk ); m_xml.endElement(); } @@ -16966,16 +17466,16 @@ namespace Catch { m_xml.writeAttribute("samples", info.samples) .writeAttribute("resamples", info.resamples) .writeAttribute("iterations", info.iterations) - .writeAttribute("clockResolution", static_cast(info.clockResolution)) - .writeAttribute("estimatedDuration", static_cast(info.estimatedDuration)) + .writeAttribute("clockResolution", info.clockResolution) + .writeAttribute("estimatedDuration", info.estimatedDuration) .writeComment("All values in nano seconds"); } void XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) { m_xml.startElement("mean") - .writeAttribute("value", static_cast(benchmarkStats.mean.point.count())) - .writeAttribute("lowerBound", static_cast(benchmarkStats.mean.lower_bound.count())) - .writeAttribute("upperBound", static_cast(benchmarkStats.mean.upper_bound.count())) + .writeAttribute("value", benchmarkStats.mean.point.count()) + .writeAttribute("lowerBound", benchmarkStats.mean.lower_bound.count()) + .writeAttribute("upperBound", benchmarkStats.mean.upper_bound.count()) .writeAttribute("ci", benchmarkStats.mean.confidence_interval); m_xml.endElement(); m_xml.startElement("standardDeviation") @@ -17026,7 +17526,7 @@ namespace Catch { #ifndef __OBJC__ -#if defined(CATCH_CONFIG_WCHAR) && defined(WIN32) && defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN) +#if defined(CATCH_CONFIG_WCHAR) && defined(CATCH_PLATFORM_WINDOWS) && defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN) // Standard C/C++ Win32 Unicode wmain entry point extern "C" int wmain (int argc, wchar_t * argv[], wchar_t * []) { #else @@ -17159,9 +17659,9 @@ int main (int argc, char * const argv[]) { #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) #define CATCH_BENCHMARK(...) \ - INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) + INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) #define CATCH_BENCHMARK_ADVANCED(name) \ - INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name) + INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), name) #endif // CATCH_CONFIG_ENABLE_BENCHMARKING // If CATCH_CONFIG_PREFIX_ALL is not defined then the CATCH_ prefix is not required @@ -17263,9 +17763,9 @@ int main (int argc, char * const argv[]) { #if defined(CATCH_CONFIG_ENABLE_BENCHMARKING) #define BENCHMARK(...) 
\ - INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) + INTERNAL_CATCH_BENCHMARK(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__,,), INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__,,)) #define BENCHMARK_ADVANCED(name) \ - INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name) + INTERNAL_CATCH_BENCHMARK_ADVANCED(INTERNAL_CATCH_UNIQUE_NAME(C_A_T_C_H_B_E_N_C_H_), name) #endif // CATCH_CONFIG_ENABLE_BENCHMARKING using Catch::Detail::Approx; @@ -17312,8 +17812,8 @@ using Catch::Detail::Approx; #define CATCH_WARN( msg ) (void)(0) #define CATCH_CAPTURE( msg ) (void)(0) -#define CATCH_TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) -#define CATCH_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define CATCH_TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) +#define CATCH_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #define CATCH_METHOD_AS_TEST_CASE( method, ... ) #define CATCH_REGISTER_TEST_CASE( Function, ... ) (void)(0) #define CATCH_SECTION( ... ) @@ -17322,7 +17822,7 @@ using Catch::Detail::Approx; #define CATCH_FAIL_CHECK( ... ) (void)(0) #define CATCH_SUCCEED( ... ) (void)(0) -#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define CATCH_TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__) @@ -17345,8 +17845,8 @@ using Catch::Detail::Approx; #endif // "BDD-style" convenience wrappers -#define CATCH_SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) -#define CATCH_SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), className ) +#define CATCH_SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) +#define CATCH_SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), className ) #define CATCH_GIVEN( desc ) #define CATCH_AND_GIVEN( desc ) #define CATCH_WHEN( desc ) @@ -17394,10 +17894,10 @@ using Catch::Detail::Approx; #define INFO( msg ) (void)(0) #define UNSCOPED_INFO( msg ) (void)(0) #define WARN( msg ) (void)(0) -#define CAPTURE( msg ) (void)(0) +#define CAPTURE( ... ) (void)(0) -#define TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) -#define TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) +#define TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #define METHOD_AS_TEST_CASE( method, ... 
) #define REGISTER_TEST_CASE( Function, ... ) (void)(0) #define SECTION( ... ) @@ -17405,7 +17905,7 @@ using Catch::Detail::Approx; #define FAIL( ... ) (void)(0) #define FAIL_CHECK( ... ) (void)(0) #define SUCCEED( ... ) (void)(0) -#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ )) +#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ )) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define TEMPLATE_TEST_CASE( ... ) INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__) @@ -17435,8 +17935,8 @@ using Catch::Detail::Approx; #define CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION_NO_REG( INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionTranslator ), signature ) // "BDD-style" convenience wrappers -#define SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ) ) -#define SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), className ) +#define SCENARIO( ... ) INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ) ) +#define SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(INTERNAL_CATCH_UNIQUE_NAME( C_A_T_C_H_T_E_S_T_ ), className ) #define GIVEN( desc ) #define AND_GIVEN( desc ) @@ -17467,4 +17967,3 @@ using Catch::Detail::Approx; // end catch_reenable_warnings.h // end catch.hpp #endif // TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED - diff --git a/src/3rd_party/intgemm b/src/3rd_party/intgemm index a05a2e51a..0eda93a95 160000 --- a/src/3rd_party/intgemm +++ b/src/3rd_party/intgemm @@ -1 +1 @@ -Subproject commit a05a2e51ab524bcee954a39ee72005193f3adf7c +Subproject commit 0eda93a95a4472af0a50c78b5df58e7fc459ac7a diff --git a/src/3rd_party/simple-websocket-server b/src/3rd_party/simple-websocket-server index 1d7e84aeb..8909c57b5 160000 --- a/src/3rd_party/simple-websocket-server +++ b/src/3rd_party/simple-websocket-server @@ -1 +1 @@ -Subproject commit 1d7e84aeb3f1ebdc78f6965d79ad3ca3003789fe +Subproject commit 8909c57b5473cb95e197fa7f034edabb474535ba diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index b0230da99..6c6b002aa 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -54,8 +54,10 @@ void ConfigValidator::validateOptionsTranslation() const { ABORT_IF(models.empty() && configs.empty(), "You need to provide at least one model file or a config file"); +#ifdef COMPILE_CPU ABORT_IF(get("model-mmap") && get("cpu-threads") == 0, "Model MMAP is CPU-only, please use --cpu-threads"); +#endif for(const auto& modelFile : models) { filesystem::Path modelPath(modelFile); diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index e05f31225..caee2e0c3 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -130,7 +130,7 @@ namespace marian { // @TODO: add checks for empty factor groups until it stops crashing (training already works; decoder still crashes) io::InputFileStream in(modelPath); - for (WordIndex v = 0; io::getline(in, line); v++) { + for(; io::getline(in, line);) { utils::splitAny(line, tokBuf, " \t"); factorMapTokenized.push_back(tokBuf); } diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 82b0df69a..bf185d570 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -221,7 +221,6 @@ 
class LexicalShortlistGenerator : public ShortlistGenerator { } void prune(float threshold = 0.f) { - size_t i = 0; for(auto& probs : data_) { std::vector> sorter; for(auto& it : probs) @@ -237,8 +236,6 @@ class LexicalShortlistGenerator : public ShortlistGenerator { else break; } - - ++i; } } From 4d3702c4ec3d3fd8efd1c40a8746f39f446d9980 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 6 Oct 2022 05:53:16 +0000 Subject: [PATCH 203/254] Merged PR 25950: Add missing defaults for concatenated factors This PR adds missing default values for concatenated factors. --- CHANGELOG.md | 1 + VERSION | 2 +- src/layers/embedding.cpp | 11 +++++------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f93148e87..c46df0f25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`) ### Fixed +- Make concat factors not break old vector implementation - Use allocator in hashing - Read/restore checkpoints from main process only when training with MPI - Multi-loss casts type to first loss-type before accumulation (aborted before due to missing cast) diff --git a/VERSION b/VERSION index 9ec465949..daf48f91d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.13 +v1.11.14 diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index d6768fdbf..b60f6cc18 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -6,10 +6,8 @@ namespace marian { Embedding::Embedding(Ptr graph, Ptr options) : LayerBase(graph, options), inference_(opt("inference")) { std::string name = opt("prefix"); - int dimVoc = opt("dimVocab"); - int dimEmb = opt("dimEmb"); - int dimFactorEmb = opt("dimFactorEmb"); - + int dimVoc = opt("dimVocab"); + int dimEmb = opt("dimEmb"); bool fixed = opt("fixed", false); // Embedding layer initialization should depend only on embedding size, hence fanIn=false @@ -21,6 +19,7 @@ Embedding::Embedding(Ptr graph, Ptr options) dimVoc = (int)factoredVocab_->factorVocabSize(); LOG_ONCE(info, "[embedding] Factored embeddings enabled"); if(opt("factorsCombine") == "concat") { + int dimFactorEmb = opt("dimFactorEmb", 0); ABORT_IF(dimFactorEmb == 0, "Embedding: If concatenation is chosen to combine the factor embeddings, a factor " "embedding size must be specified."); @@ -179,8 +178,8 @@ Expr Embedding::applyIndices(const std::vector& embIdx, const Shape& "prefix", (opt("tied-embeddings-src") || opt("tied-embeddings-all")) ? "Wemb" : prefix_ + "_Wemb", "fixed", embeddingFix_, - "dimFactorEmb", opt("factors-dim-emb"), // for factored embeddings - "factorsCombine", opt("factors-combine"), // for factored embeddings + "dimFactorEmb", opt("factors-dim-emb", 0), // for factored embeddings + "factorsCombine", opt("factors-combine", ""), // for factored embeddings "vocab", opt>("vocabs")[batchIndex_]); // for factored embeddings // clang-format on if(options_->hasAndNotEmpty("embedding-vectors")) { From a6de1b781c61f129f77fabf5f75a9a3b60c55913 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 1 Nov 2022 06:26:56 +0000 Subject: [PATCH 204/254] Merged PR 26271: Update CI pipeline triggers Updates to the CI triggers: - Stop running parallel CI runs, i.e. if a pipeline is running, it must finish before new runs are started. 
- Exclude paths to files, which are not related to/critical the codebase - Downloading MKL from a mirror hosting server --- azure-pipelines.yml | 31 +++++++++++++++++++++++++++---- azure-regression-tests.yml | 3 +++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0c7bd9c72..5a989db43 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -16,7 +16,24 @@ parameters: # The pipeline CI trigger is set on the branch master only and PR trigger on a # (non-draft) pull request to any branch trigger: -- master + # This minimizes the number of parallel pipeline runs. When a pipeline is + # running, the CI waits until it is completed before starting another one. + batch: true + branches: + include: + - master + paths: + exclude: + - azure-regression-tests.yml + - contrib + - doc + - examples + - regression-tests + - scripts + - VERSION + - vs + - '**/*.md' + - '**/*.txt' pool: name: Azure Pipelines @@ -32,7 +49,7 @@ variables: - name: MKL_DIR value: "$(Build.SourcesDirectory)/mkl" - name: MKL_URL - value: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" + value: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" - name: VCPKG_COMMIT value: 2022.03.10 - name: VCPKG_DIR @@ -52,6 +69,7 @@ stages: ###################################################################### - job: BuildWindows + cancelTimeoutInMinutes: 1 condition: eq(${{ parameters.runBuilds }}, true) displayName: Windows @@ -188,6 +206,7 @@ stages: ###################################################################### - job: BuildUbuntu + cancelTimeoutInMinutes: 1 condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu timeoutInMinutes: 120 @@ -324,6 +343,7 @@ stages: ###################################################################### - job: BuildMacOS + cancelTimeoutInMinutes: 1 condition: eq(${{ parameters.runBuilds }}, true) displayName: macOS CPU clang @@ -373,6 +393,7 @@ stages: ###################################################################### - job: BuildInstall + cancelTimeoutInMinutes: 1 condition: eq(${{ parameters.runBuilds }}, true) displayName: Linux CPU library install @@ -435,6 +456,7 @@ stages: ###################################################################### - job: TestWindows + cancelTimeoutInMinutes: 1 displayName: Windows CPU+FBGEMM pool: @@ -535,7 +557,7 @@ stages: ls displayName: Prepare tests env: - AWS_SECRET_SAS_TOKEN: $(blob-sas-token) + SAS_TOKEN: $(blob-sas-token) workingDirectory: marian-prod-tests # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' @@ -560,6 +582,7 @@ stages: ###################################################################### - job: TestLinux + cancelTimeoutInMinutes: 1 displayName: Linux CPU+FBGEMM pool: @@ -636,7 +659,7 @@ stages: ls displayName: Prepare tests env: - AWS_SECRET_SAS_TOKEN: $(blob-sas-token) + AZURE_STORAGE_SAS_TOKEN: $(blob-sas-token) workingDirectory: marian-prod-tests - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops' diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index c849b59df..a56c9dce5 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -20,6 +20,7 @@ stages: ###################################################################### - job: TestsGPULinux + cancelTimeoutInMinutes: 1 displayName: Linux GPU tests timeoutInMinutes: 120 @@ -108,6 +109,8 @@ stages: git pull origin master make install 
displayName: Prepare regression tests + env: + AZURE_STORAGE_SAS_TOKEN: $(blob-sas-token) workingDirectory: regression-tests # Continue on error to be able to collect outputs and publish them as an artifact From be1ee3fa944c6e49f61d0e5e1153333b49dd936a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 1 Nov 2022 10:07:40 +0000 Subject: [PATCH 205/254] Merged PR 26318: Fix incorrect envvar name in Azure Pipeline Fix incorrect environment variable name for SAS token in Windows tests --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5a989db43..bef33aec2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -557,7 +557,7 @@ stages: ls displayName: Prepare tests env: - SAS_TOKEN: $(blob-sas-token) + AZURE_STORAGE_SAS_TOKEN: $(blob-sas-token) workingDirectory: marian-prod-tests # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' From cda2f2112fb7e482a3eb75c95aaa4111c8883d61 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Wed, 2 Nov 2022 11:09:43 +0000 Subject: [PATCH 206/254] Temporarily download MKL tarball from a mirror server (#972) --- .github/workflows/release.yml | 2 +- .github/workflows/windows.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8a3761e3b..5beab28f0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -158,7 +158,7 @@ jobs: - name: Download MKL run: | - C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip + C:\msys64\usr\bin\wget.exe -nv https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip -O mkl.zip Expand-Archive -Force mkl.zip ${{ github.workspace }}\mkl # Set the MKLROOT environment variable so that CMake can find MKL. # GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ee85f303d..b1d6b1bd1 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -7,7 +7,7 @@ on: branches: [ master ] env: - MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" + MKL_URL: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" BOOST_ROOT: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64" BOOST_URL: "https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe" From 07a2ac8126a848852007097c428da76c1238e4d0 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Wed, 2 Nov 2022 11:16:14 +0000 Subject: [PATCH 207/254] best-deep alias broken (#968) The best-deep alias in marian is currently broken, because it doesn't set the model type and the default is `amum` which is incompatible with multiple layers. This commit just adds the type to the best-deep alias entry. --- CHANGELOG.md | 1 + src/common/aliases.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f93148e87..ca943b8c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Fixed fp16 training/inference with factors-combine concat method - Fixed clang 13.0.1 compatibility - Fixed potential vulnerabilities from lxml<4.9.1 or mistune<2.0.31 +- Fixed the `--best-deep` RNN alias not setting the s2s model type ### Changed - Parameter synchronization in local sharding model now executes hash checksum before syncing diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index 3db31e515..75d9bdf97 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -46,6 +46,7 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) { // Options setting the BiDeep architecture proposed in http://www.aclweb.org/anthology/W17-4710 cli.alias("best-deep", "true", [](YAML::Node& config) { + config["type"] = "s2s"; config["layer-normalization"] = true; config["tied-embeddings"] = true; config["enc-type"] = "alternating"; From 4187aab9baedeb0f9f9d3cce9a531b95b1c1d357 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 19 Nov 2022 08:35:45 +0000 Subject: [PATCH 208/254] Bump regression-tests from `92e116e` to `494d6de` (#973) Bumps [regression-tests](https://github.com/marian-nmt/marian-regression-tests) from `92e116e` to `494d6de`. - [Release notes](https://github.com/marian-nmt/marian-regression-tests/releases) - [Commits](https://github.com/marian-nmt/marian-regression-tests/compare/92e116efa369d6ed848c8eb19dfcef8bf7245d71...494d6de2bd0d745cd8eaf8614d75fe36d01b5519) --- updated-dependencies: - dependency-name: regression-tests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 92e116efa..494d6de2b 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 92e116efa369d6ed848c8eb19dfcef8bf7245d71 +Subproject commit 494d6de2bd0d745cd8eaf8614d75fe36d01b5519 From 36349645b8e0dc77d87ab921991d4386e8a0d571 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 19 Nov 2022 08:36:06 +0000 Subject: [PATCH 209/254] Bump src/3rd_party/sentencepiece from `31ac8e8` to `8dc9172` (#970) Bumps [src/3rd_party/sentencepiece](https://github.com/marian-nmt/sentencepiece) from `31ac8e8` to `8dc9172`. - [Release notes](https://github.com/marian-nmt/sentencepiece/releases) - [Commits](https://github.com/marian-nmt/sentencepiece/compare/31ac8e88760f48d31843eeed36136458df0f60aa...8dc9172f88b1d4054ca38de0e5362b2935e9b53f) --- updated-dependencies: - dependency-name: src/3rd_party/sentencepiece dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/3rd_party/sentencepiece | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 31ac8e887..8dc9172f8 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 31ac8e88760f48d31843eeed36136458df0f60aa +Subproject commit 8dc9172f88b1d4054ca38de0e5362b2935e9b53f From c79dc80a2fc114b083e01387f3503da680a3541a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Sun, 20 Nov 2022 13:31:10 +0000 Subject: [PATCH 210/254] Merged PR 26617: Update regression-tests & fix CI pipelines Update regression-tests & fix CI pipelines --- azure-pipelines.yml | 10 +++++----- azure-regression-tests.yml | 6 +++++- regression-tests | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bef33aec2..faa619006 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -39,7 +39,7 @@ pool: name: Azure Pipelines variables: - - group: marian-prod-tests + - group: marian-regression-tests - name: BOOST_ROOT_WINDOWS value: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64" - name: BOOST_URL @@ -550,14 +550,14 @@ stages: displayName: Machine statistics workingDirectory: marian-prod-tests - # The current SAS token will expire on 8/30/2023 and a new one will need to be set in Marian > Pipelines > Library + # The current SAS token will expire on 12/31/2023 and a new one will need to be set in Marian > Pipelines > Library - bash: | cd models bash download-models.sh ls displayName: Prepare tests env: - AZURE_STORAGE_SAS_TOKEN: $(blob-sas-token) + AZURE_STORAGE_SAS_TOKEN: $(marian-prod-tests-blob-sas-token) workingDirectory: marian-prod-tests # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' @@ -652,14 +652,14 @@ stages: displayName: Machine statistics workingDirectory: marian-prod-tests - # The current SAS token will expire on 8/30/2023 and a new one will need to be set in Marian > Pipelines > Library + # The current SAS token will expire on 12/31/2023 and a new one will need to be set in Marian > Pipelines > Library - bash: | cd models bash download-models.sh ls displayName: Prepare tests env: - AZURE_STORAGE_SAS_TOKEN: $(blob-sas-token) + AZURE_STORAGE_SAS_TOKEN: $(marian-prod-tests-blob-sas-token) workingDirectory: marian-prod-tests - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops' diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index a56c9dce5..cb3730c19 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -14,6 +14,9 @@ trigger: none # Hosted Azure DevOps Pool determining OS, CUDA version and available GPUs pool: mariandevops-pool-m60-eus +variables: + - group: marian-regression-tests + stages: - stage: TestsGPU jobs: @@ -104,13 +107,14 @@ stages: workingDirectory: build # Always run regression tests from the master branch + # The current SAS token will expire on 12/31/2023 and a new one will need to be set in Marian > Pipelines > Library - bash: | git checkout master git pull origin master make install displayName: Prepare regression tests env: - AZURE_STORAGE_SAS_TOKEN: $(blob-sas-token) + AZURE_STORAGE_SAS_TOKEN: $(marian-pub-tests-blob-sas-token) workingDirectory: regression-tests # Continue on error to be able to collect outputs and publish them as an artifact diff --git 
a/regression-tests b/regression-tests index 92e116efa..488d454a0 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 92e116efa369d6ed848c8eb19dfcef8bf7245d71 +Subproject commit 488d454a0177ef300eab91ab813e485d420dc38d From b6581c4c44147f92ad8febe0e331bd68a8bda23f Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Wed, 23 Nov 2022 19:16:44 +0000 Subject: [PATCH 211/254] Merged PR 26667: Update examples submodule to fix vulnerability issues Updating examples submodule using [protobuf 3.20.2](https://github.com/marian-nmt/marian-examples/pull/29) to fix recent [vulnerability issues](https://machinetranslation.visualstudio.com/MachineTranslation/_componentGovernance/mtmain/alert/8035094?typeId=14698327&pipelinesTrackingFilter=0). Related work items: #134319 --- examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples b/examples index 25e843832..58f48a067 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 25e84383225a29f769e362250654ddf256d06261 +Subproject commit 58f48a06756c623fe799613134810322e061863f From d5569cee606e2d1b62b4e62bbd00f03396d23b78 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Nov 2022 18:22:00 +0000 Subject: [PATCH 212/254] Bump regression-tests from `494d6de` to `488d454` (#974) Bumps [regression-tests](https://github.com/marian-nmt/marian-regression-tests) from `494d6de` to `488d454`. - [Release notes](https://github.com/marian-nmt/marian-regression-tests/releases) - [Commits](https://github.com/marian-nmt/marian-regression-tests/compare/494d6de2bd0d745cd8eaf8614d75fe36d01b5519...488d454a0177ef300eab91ab813e485d420dc38d) --- updated-dependencies: - dependency-name: regression-tests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 494d6de2b..488d454a0 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 494d6de2bd0d745cd8eaf8614d75fe36d01b5519 +Subproject commit 488d454a0177ef300eab91ab813e485d420dc38d From 3c2a432995e58ff68e00d9764147b2fb23255aa0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Nov 2022 18:22:22 +0000 Subject: [PATCH 213/254] Bump examples from `25e8438` to `58f48a0` (#975) Bumps [examples](https://github.com/marian-nmt/marian-examples) from `25e8438` to `58f48a0`. - [Release notes](https://github.com/marian-nmt/marian-examples/releases) - [Commits](https://github.com/marian-nmt/marian-examples/compare/25e84383225a29f769e362250654ddf256d06261...58f48a06756c623fe799613134810322e061863f) --- updated-dependencies: - dependency-name: examples dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples b/examples index 25e843832..58f48a067 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 25e84383225a29f769e362250654ddf256d06261 +Subproject commit 58f48a06756c623fe799613134810322e061863f From b7205fc0b029efe5ea62fd3e35ac2a2227ea641f Mon Sep 17 00:00:00 2001 From: Alex Muzio Date: Wed, 30 Nov 2022 12:23:38 +0000 Subject: [PATCH 214/254] Merged PR 25220: Add extra model information to model_info.py script Adding model shapes flag to model_info.py script: dtype and total number of model parameters. Example: `python model_info.py -m ~/model.npz -mi` --- regression-tests | 2 +- scripts/contrib/model_info.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/regression-tests b/regression-tests index 488d454a0..2a8bed3f0 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 488d454a0177ef300eab91ab813e485d420dc38d +Subproject commit 2a8bed3f0e937a9de2d6fa92dee3bcf482d3d47b diff --git a/scripts/contrib/model_info.py b/scripts/contrib/model_info.py index 3c5730844..9e2a02631 100755 --- a/scripts/contrib/model_info.py +++ b/scripts/contrib/model_info.py @@ -42,11 +42,15 @@ def main(): else: print(model[args.key]) else: + total_nb_of_parameters = 0 for key in model: - if args.matrix_shapes: - print(key, model[key].shape) + if not key == S2S_SPECIAL_NODE: + total_nb_of_parameters += np.prod(model[key].shape) + if args.matrix_info: + print(key, model[key].shape, model[key].dtype) else: print(key) + print('Total number of parameters:', total_nb_of_parameters) def parse_args(): @@ -57,8 +61,8 @@ def parse_args(): help="print values from special:model.yml node") parser.add_argument("-f", "--full-matrix", action="store_true", help="force numpy to print full arrays for single key") - parser.add_argument("-ms", "--matrix-shapes", action="store_true", - help="print shapes of all arrays in the model") + parser.add_argument("-mi", "--matrix-info", action="store_true", + help="print full matrix info for all keys. Includes shape and dtype") return parser.parse_args() From ee50d4aaeabbec3a82628d0804b0e078b04b84d4 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 20 Dec 2022 17:56:10 +0000 Subject: [PATCH 215/254] Merged PR 27051: Add an option for completely resetting validation metrics Added `--valid-reset-all` that works as `--valid-reset-stalled` but it also resets last best saved validation metrics, which is useful for when the validation sets change for continued training. Added new regression test: https://github.com/marian-nmt/marian-regression-tests/pull/89 --- CHANGELOG.md | 1 + VERSION | 2 +- azure-pipelines.yml | 5 ++++- src/common/config_parser.cpp | 6 ++++-- src/training/scheduler.h | 13 +++++++++---- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c46df0f25..53f81397d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Fused inplace-dropout in FFN layer in Transformer - `--force-decode` option for marian-decoder - `--output-sampling` now works with ensembles (requires proper normalization via e.g `--weights 0.5 0.5`) +- `--valid-reset-all` option ### Fixed - Make concat factors not break old vector implementation diff --git a/VERSION b/VERSION index daf48f91d..2eac760f5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.14 +v1.11.15 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index faa619006..3b1bfff3f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -595,7 +595,10 @@ stages: # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev - - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-9 g++-9 + # Installing libunwind-dev fixes a bug in 2204 (the libunwind-14 and libunwind-dev conflict) + - bash: | + sudo apt-get install -y libunwind-dev + sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler gcc-9 g++-9 displayName: Install packages # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index c9ab45f81..4cc23f2ca 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -269,7 +269,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", "Size of position-wise feed-forward network (transformer)", - 2048); + 2048); cli.add("--transformer-decoder-dim-ffn", "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.", 0); @@ -591,7 +591,9 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { "Multiple metrics can be specified", {"cross-entropy"}); cli.add("--valid-reset-stalled", - "Reset all stalled validation metrics when the training is restarted"); + "Reset stalled validation metrics when the training is restarted"); + cli.add("--valid-reset-all", + "Reset all validation metrics when the training is restarted"); cli.add("--early-stopping", "Stop if the first validation metric does not improve for arg consecutive validation steps", 10); diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 34aa18c21..30f8c8de7 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -494,12 +494,17 @@ class Scheduler : public TrainingObserver { state_->wordsDisp = 0; } - if(options_->get("valid-reset-stalled")) { + if(options_->get("valid-reset-stalled") || options_->get("valid-reset-all")) { state_->stalled = 0; state_->maxStalled = 0; for(const auto& validator : validators_) { - if(state_->validators[validator->type()]) + if(state_->validators[validator->type()]) { + // reset the number of stalled validations, e.g. when the validation set is the same state_->validators[validator->type()]["stalled"] = 0; + // reset last best results as well, e.g. 
when the validation set changes + if(options_->get("valid-reset-all")) + state_->validators[validator->type()]["last-best"] = validator->initScore(); + } } } @@ -512,10 +517,10 @@ class Scheduler : public TrainingObserver { if(mpi_->isMainProcess()) if(filesystem::exists(nameYaml)) yamlStr = io::InputFileStream(nameYaml).readToString(); - + if(mpi_) mpi_->bCast(yamlStr); - + loadFromString(yamlStr); } From 4f145c450f2b4b956d175fbbfe118a90e494acf4 Mon Sep 17 00:00:00 2001 From: Varun Mathur Date: Fri, 10 Feb 2023 16:34:37 +0000 Subject: [PATCH 216/254] Merged PR 26311: [FSM] make model loading lock non-static make lock non-static --- src/data/factored_vocab.cpp | 3 +-- src/data/factored_vocab.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index caee2e0c3..f51869d56 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -21,8 +21,7 @@ namespace marian { maxSizeUnused; // If model has already been loaded, then assume this is a shared object, and skip loading it again. // This can be multi-threaded, so must run under lock. - static std::mutex s_mtx; - std::lock_guard criticalSection(s_mtx); + std::lock_guard criticalSection(loadMtx_); if (size() != 0) { //LOG(info, "[vocab] Attempting to load model a second time; skipping (assuming shared vocab)"); return size(); diff --git a/src/data/factored_vocab.h b/src/data/factored_vocab.h index b644ce4c4..edbee1544 100644 --- a/src/data/factored_vocab.h +++ b/src/data/factored_vocab.h @@ -110,6 +110,7 @@ class FactoredVocab : public IVocab { Word unkId_{}; WordLUT vocab_; size_t lemmaSize_; + std::mutex loadMtx_; // factors char factorSeparator_ = '|'; // separator symbol for parsing factored words From 9ad5203ca228fb63cba5147ebf566849945a4919 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 11 Feb 2023 16:35:29 +0000 Subject: [PATCH 217/254] Merged PR 26476: Sanitize guided-alignment with case-augmentation (still somewhat wonky) This fixes the blow-ups of using case-augmentation with guided-alignment. However, it's still not recommended to use this particular combination, results will be unreliable. --- src/data/corpus.cpp | 2 +- src/data/corpus_base.cpp | 5 +++- src/data/corpus_base.h | 8 +++++-- src/graph/expression_graph.cpp | 8 +++++++ src/graph/expression_graph.h | 6 +++++ src/graph/expression_operators.cpp | 5 ++++ src/graph/expression_operators.h | 5 ++++ src/layers/guided_alignment.h | 38 +++++++++++++++++------------- 8 files changed, 57 insertions(+), 20 deletions(-) diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 835d9d76c..b36d42ac2 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -128,7 +128,7 @@ SentenceTuple Corpus::next() { size_t vocabId = i - shift; bool altered; preprocessLine(fields[i], vocabId, curId, /*out=*/altered); - if (altered) + if(altered) tup.markAltered(); addWordsToSentenceTuple(fields[i], vocabId, tup); } diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index addcc3bfa..d276ca6bc 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -476,7 +476,10 @@ void CorpusBase::addAlignmentsToBatch(Ptr batch, // If the batch vector is altered within marian by, for example, case augmentation, // the guided alignments we received for this tuple cease to be valid. // Hence skip setting alignments for that sentence tuple.. 
- if (!batchVector[b].isAltered()) { + if (batchVector[b].isAltered()) { + LOG_ONCE(info, "Using guided-alignment with case-augmentation is not recommended and can result in unexpected behavior"); + aligns.push_back(WordAlignment()); + } else { aligns.push_back(std::move(batchVector[b].getAlignment())); } } diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 4e6d923ee..2e572ebd8 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -56,12 +56,16 @@ class SentenceTupleImpl { * @brief Returns whether this Tuple was altered or augmented from what * was provided to Marian in input. */ - bool isAltered() const { return altered_; } + bool isAltered() const { + return altered_; + } /** * @brief Mark that this Tuple was internally altered or augmented by Marian */ - void markAltered() { altered_ = true; } + void markAltered() { + altered_ = true; + } /** * @brief Adds a new sentence at the end of the tuple. diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 146f7c4ca..9e90b5413 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -64,6 +64,14 @@ Expr ExpressionGraph::add(Expr node) { } } +/** + * Removes the node from the set of roots (will not be initialized during back propagation) + * @param node a pointer to a expression node + */ +void ExpressionGraph::removeAsRoot(Expr node) { + topNodes_.erase(node); +} + // Call on every checkpoint in backwards order void createSubtape(Expr node) { auto subtape = New>(); diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 9272e42a3..da69af091 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -676,6 +676,12 @@ class ExpressionGraph : public std::enable_shared_from_this { * @param node a pointer to a expression node */ Expr add(Expr node); + + /** + * Removes the node from the set of roots (will not be initialized during back propagation) + * @param node a pointer to a expression node + */ + void removeAsRoot(Expr node); /** * Allocate memory for the forward pass of the given node. 
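The loss side of this commit (src/layers/guided_alignment.h, further below in this diff) now gathers only the attention cells named by the alignment points, so an altered tuple, which now carries an empty WordAlignment, simply contributes nothing to the loss. A standalone sketch of that sparse cross-entropy over a plain row-major [srcLen x tgtLen] attention matrix, with illustrative names rather than Marian's batched Expr API:

    #include <cmath>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // One alignment point (source position, target position), as in WordAlignment.
    using AlignPoint = std::pair<size_t, size_t>;

    // Sparse cross-entropy over the aligned attention cells; returns {loss, numLabels}.
    // An empty alignment yields zero loss and zero labels, mirroring the skip above.
    std::pair<float, size_t> sparseAlignmentCE(const std::vector<float>& attn,   // row-major [srcLen x tgtLen]
                                               size_t tgtLen,
                                               const std::vector<AlignPoint>& align) {
      const float eps = 1e-6f;                      // same epsilon as in guidedAlignmentCost
      float loss = 0.f;
      for(const auto& p : align) {
        size_t flat = p.first * tgtLen + p.second;  // flatten (src, tgt) into the attention matrix
        loss += -std::log(attn[flat] + eps);        // per-point weight taken as 1 here for brevity
      }
      return {loss, align.size()};
    }

In the actual code the per-point value is the (possibly fractional) alignment weight rather than 1, the indices are flattened over the full batched attention tensor, and the empty case additionally sets the label count to 0 or 1 depending on the multi-loss type.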
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 09049f98f..b0d40949b 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -27,6 +27,11 @@ Expr checkpoint(Expr a) { return a; } +Expr removeAsRoot(Expr a) { + a->graph()->removeAsRoot(a); // ugly, hence why hidden here + return a; +} + Expr lambda(const std::vector& nodes, Shape shape, Type type, LambdaNodeFunctor fwd, size_t hash) { return Expression(nodes, shape, type, fwd, hash); diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 5d9ceab36..cc3e6028b 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -16,6 +16,11 @@ Expr debug(Expr a, const std::string& message = ""); */ Expr checkpoint(Expr a); +/** + * Removes the node from the set of root nodes, no-op otherwise + */ +Expr removeAsRoot(Expr node); + typedef Expr(ActivationFunction)(Expr); ///< ActivationFunction has signature Expr(Expr) /** diff --git a/src/layers/guided_alignment.h b/src/layers/guided_alignment.h index d5929a6d6..50a785573 100644 --- a/src/layers/guided_alignment.h +++ b/src/layers/guided_alignment.h @@ -26,7 +26,8 @@ guidedAlignmentToSparse(Ptr batch) { std::sort(byIndex.begin(), byIndex.end(), [](const BiPoint& a, const BiPoint& b) { return std::get<0>(a) < std::get<0>(b); }); std::vector indices; std::vector valuesFwd; - indices.reserve(byIndex.size()); valuesFwd.reserve(byIndex.size()); + indices.reserve(byIndex.size()); + valuesFwd.reserve(byIndex.size()); for(auto& p : byIndex) { indices.push_back((IndexType)std::get<0>(p)); valuesFwd.push_back(std::get<1>(p)); @@ -40,28 +41,33 @@ static inline RationalLoss guidedAlignmentCost(Ptr graph, Ptr options, Expr attention) { // [beam depth=1, max src length, batch size, tgt length] std::string guidedLossType = options->get("guided-alignment-cost"); // @TODO: change "cost" to "loss" - + // @TODO: It is ugly to check the multi-loss type here, but doing this right requires + // a substantial rewrite of the multi-loss architecture, which is planned anyways. + std::string multiLossType = options->get("multi-loss-type", "sum"); + // We dropped support for other losses which are not possible to implement with sparse labels. // They were most likely not used anyway. ABORT_IF(guidedLossType != "ce", "Only alignment loss type 'ce' is supported"); float guidedLossWeight = options->get("guided-alignment-weight"); - - auto [indices, values] = guidedAlignmentToSparse(batch); - auto alignmentIndices = graph->indices(indices); - auto alignmentValues = graph->constant({(int)values.size()}, inits::fromVector(values)); - auto attentionAtAligned = cols(flatten(attention), alignmentIndices); - - float epsilon = 1e-6f; - Expr alignmentLoss = -sum(cast(alignmentValues * log(attentionAtAligned + epsilon), Type::float32)); - size_t numLabels = alignmentIndices->shape().elements(); - + const auto& [indices, values] = guidedAlignmentToSparse(batch); + + Expr alignmentLoss; + size_t numLabels = indices.size(); // can be zero + if(indices.empty()) { + removeAsRoot(stopGradient(attention)); // unused, hence make sure we don't polute the backwards operations + alignmentLoss = graph->zeros({1}); + numLabels = multiLossType == "sum" ? 
0 : 1; + } else { + float epsilon = 1e-6f; + auto alignmentIndices = graph->indices(indices); + auto alignmentValues = graph->constant({(int)values.size()}, inits::fromVector(values)); + auto attentionAtAligned = cols(flatten(attention), alignmentIndices); + alignmentLoss = -sum(cast(alignmentValues * log(attentionAtAligned + epsilon), Type::float32)); + } // Create label node, also weigh by scalar so labels and cost are in the same domain. // Fractional label counts are OK. But only if combined as "sum". - // @TODO: It is ugly to check the multi-loss type here, but doing this right requires - // a substantial rewrite of the multi-loss architecture, which is planned anyways. - std::string multiLossType = options->get("multi-loss-type", "sum"); - if (multiLossType == "sum") // sum of sums + if (multiLossType == "sum") // sum of sums return RationalLoss(guidedLossWeight * alignmentLoss, guidedLossWeight * numLabels); else return RationalLoss(guidedLossWeight * alignmentLoss, (float)numLabels); From 031dbb32668cf82f767524394f8dd500f6227b0f Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 13 Feb 2023 15:44:19 +0000 Subject: [PATCH 218/254] Merged PR 27804: Fallback to old LSH code for MSVC due to bad loop unrolling The Visual Studio compiler has inferior optimization and loop unrolling to gcc which results in much slower LSH code that was written to explicitly take advantage of loop unrolling at compile time. Added an #ifdef to fall back to old LSH code on MSVC. --- src/layers/lsh.cpp | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp index 73d45fc71..eedf227ee 100644 --- a/src/layers/lsh.cpp +++ b/src/layers/lsh.cpp @@ -1,6 +1,12 @@ +#include "common/timer.h" +#include "common/utils.h" #include "layers/lsh.h" +#include "layers/lsh_impl.h" #include "tensors/tensor_operators.h" -#include "common/utils.h" + +#if _MSC_VER +#include "3rd_party/faiss/Index.h" +#endif #include "3rd_party/faiss/utils/hamming.h" @@ -8,10 +14,6 @@ #include "3rd_party/faiss/VectorTransform.h" #endif -#include "common/timer.h" - -#include "layers/lsh_impl.h" - namespace marian { namespace lsh { @@ -116,7 +118,7 @@ Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int dimK, int firstNR int currBeamSize = encodedQuery->shape()[0]; int batchSize = encodedQuery->shape()[2]; - + auto search = [=](Expr out, const std::vector& inputs) { Expr encodedQuery = inputs[0]; Expr encodedWeights = inputs[1]; @@ -130,6 +132,32 @@ Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int dimK, int firstNR ABORT_IF(dimK > wRows, "k is larger than number of candidate values?"); // @TODO: use min(k, wRows) silently? +#if _MSC_VER // unfortunately MSVC is horrible at loop unrolling, so we fall back to the old code (hrmph!) @TODO: figure this out one day + int qRows = encodedQuery->shape().elements() / bytesPerVector; + + uint8_t* qCodes = encodedQuery->val()->data(); + uint8_t* wCodes = encodedWeights->val()->data(); + + // use actual faiss code for performing the hamming search. + std::vector distances(qRows * dimK); + std::vector ids(qRows * dimK); + faiss::int_maxheap_array_t res = {(size_t)qRows, (size_t)dimK, ids.data(), distances.data()}; + faiss::hammings_knn_hc(&res, qCodes, wCodes, (size_t)wRows, (size_t)bytesPerVector, 0); + + // Copy int64_t indices to Marian index type and sort by increasing index value per hypothesis. 
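Both branches of the _MSC_VER guard compute the same thing: for every query row, the k candidate rows with the smallest Hamming distance over byte-packed LSH codes, via faiss::hammings_knn_hc here and via hammingTopK in the #else branch below. A simplified, faiss-independent sketch with hypothetical helper names (std::partial_sort instead of a heap, portable bit counting instead of intrinsics):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Hamming distance between two byte-packed codes of length bytesPerVector.
    inline int hammingDistance(const uint8_t* a, const uint8_t* b, size_t bytesPerVector) {
      int dist = 0;
      for(size_t i = 0; i < bytesPerVector; ++i) {
        uint8_t x = a[i] ^ b[i];
        while(x) { dist += x & 1; x >>= 1; }  // bit-by-bit popcount for portability
      }
      return dist;
    }

    // For one query row, return the indices of the k closest candidate rows.
    std::vector<uint32_t> hammingTopKRow(const uint8_t* query,
                                         const uint8_t* codes, size_t numRows,
                                         size_t bytesPerVector, size_t k) {
      std::vector<std::pair<int, uint32_t>> distIdx(numRows);
      for(size_t r = 0; r < numRows; ++r)
        distIdx[r] = std::make_pair(hammingDistance(query, codes + r * bytesPerVector, bytesPerVector),
                                    (uint32_t)r);
      k = std::min(k, numRows);
      std::partial_sort(distIdx.begin(), distIdx.begin() + k, distIdx.end());
      std::vector<uint32_t> topk(k);
      for(size_t i = 0; i < k; ++i)
        topk[i] = distIdx[i].second;
      return topk;
    }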
+ // The sorting is required as we later do a binary search on those values for reverse look-up. + uint32_t* outData = out->val()->data(); + + int numHypos = out->shape().elements() / dimK; + for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { + size_t startIdx = dimK * hypoIdx; + size_t endIdx = startIdx + dimK; + for(size_t i = startIdx; i < endIdx; ++i) + outData[i] = (uint32_t)ids[i]; + if(!noSort) + std::sort(outData + startIdx, outData + endIdx); + } +#else // this is using the new code for search, other parts of the code, like conversion are fine. IndexType* outData = out->val()->data(); auto gather = [outData, dimK](IndexType rowId, IndexType k, IndexType kthColId, DistType /*dist*/) { outData[rowId * dimK + k] = kthColId; @@ -144,6 +172,7 @@ Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int dimK, int firstNR params.bytesPerVector = bytesPerVector; hammingTopK(params, gather); +#endif }; Shape kShape({currBeamSize, batchSize, dimK}); From 9871c9007f39578b21152df6469e07e95c6cb9e5 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 20 Feb 2023 20:20:59 +0000 Subject: [PATCH 219/254] Merged PR 27999: Update internal master to public master Pull in changes from public master for sync. No review required. --- .github/workflows/release.yml | 2 +- .github/workflows/windows.yml | 2 +- CHANGELOG.md | 1 + src/common/aliases.cpp | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8a3761e3b..5beab28f0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -158,7 +158,7 @@ jobs: - name: Download MKL run: | - C:\msys64\usr\bin\wget.exe -nv https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip -O mkl.zip + C:\msys64\usr\bin\wget.exe -nv https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip -O mkl.zip Expand-Archive -Force mkl.zip ${{ github.workspace }}\mkl # Set the MKLROOT environment variable so that CMake can find MKL. # GITHUB_WORKSPACE is an environment variable available on all GitHub-hosted runners diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ee85f303d..b1d6b1bd1 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -7,7 +7,7 @@ on: branches: [ master ] env: - MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" + MKL_URL: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" BOOST_ROOT: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64" BOOST_URL: "https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe" diff --git a/CHANGELOG.md b/CHANGELOG.md index 53f81397d..f66f456fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Fixed fp16 training/inference with factors-combine concat method - Fixed clang 13.0.1 compatibility - Fixed potential vulnerabilities from lxml<4.9.1 or mistune<2.0.31 +- Fixed the `--best-deep` RNN alias not setting the s2s model type ### Changed - Parameter synchronization in local sharding model now executes hash checksum before syncing diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index 3db31e515..75d9bdf97 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -46,6 +46,7 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) { // Options setting the BiDeep architecture proposed in http://www.aclweb.org/anthology/W17-4710 cli.alias("best-deep", "true", [](YAML::Node& config) { + config["type"] = "s2s"; config["layer-normalization"] = true; config["tied-embeddings"] = true; config["enc-type"] = "alternating"; From 65bf82ffce52f4854295d8b98482534f176d494e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 21 Feb 2023 09:56:29 -0800 Subject: [PATCH 220/254] version 1.12.0 (#980) --- CHANGELOG.md | 3 ++- VERSION | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f66f456fa..3e325e25e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). - ## [Unreleased] +## [1.12.0] - 2023-02-20 + ### Added - Fused inplace-dropout in FFN layer in Transformer - `--force-decode` option for marian-decoder diff --git a/VERSION b/VERSION index 2eac760f5..a5effa303 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.15 +v1.12.0 From efcd3dae71c63036d2b1d5f5992125dabacd2a92 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 23 Feb 2023 06:15:47 +0000 Subject: [PATCH 221/254] Merged PR 28059: Add missing default for factors This adds a missing default for factors, the error does not manifest on the command line since it's set in `config_parser.cpp` --- CHANGELOG.md | 3 +++ VERSION | 2 +- src/common/config.cpp | 15 --------------- src/layers/embedding.cpp | 2 +- src/layers/output.cpp | 30 +++++++++++++++++++++++++++--- src/models/s2s.h | 2 +- src/models/transformer.h | 2 +- 7 files changed, 34 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e325e25e..aa6f06bee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
## [Unreleased] +### Fixed +- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp) + ## [1.12.0] - 2023-02-20 ### Added diff --git a/VERSION b/VERSION index a5effa303..51b86ba24 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.0 +v1.12.1 diff --git a/src/common/config.cpp b/src/common/config.cpp index 9878c70b0..a1c4ed5ac 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -116,21 +116,6 @@ void Config::initialize(ConfigParser const& cp) { config_["tsv-fields"] = tsvFields; } - // ensures factors backward compatibility whilst keeping the more user friendly CLI - if(get("lemma-dependency").empty()) { - YAML::Node config; - int lemmaDimEmb = get("lemma-dim-emb"); - if(lemmaDimEmb > 0) { - config_["lemma-dependency"] = "re-embedding"; - } else if(lemmaDimEmb == -1) { - config_["lemma-dependency"] = "lemma-dependent-bias"; - } else if(lemmaDimEmb == -2) { - config_["lemma-dependency"] = "soft-transformer-layer"; - } else if(lemmaDimEmb == -3) { - config_["lemma-dependency"] = "hard-transformer-layer"; - } - } - // echo full configuration log(); diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index b60f6cc18..334f0b865 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -179,7 +179,7 @@ Expr Embedding::applyIndices(const std::vector& embIdx, const Shape& : prefix_ + "_Wemb", "fixed", embeddingFix_, "dimFactorEmb", opt("factors-dim-emb", 0), // for factored embeddings - "factorsCombine", opt("factors-combine", ""), // for factored embeddings + "factorsCombine", opt("factors-combine", "sum"), // for factored embeddings "vocab", opt>("vocabs")[batchIndex_]); // for factored embeddings // clang-format on if(options_->hasAndNotEmpty("embedding-vectors")) { diff --git a/src/layers/output.cpp b/src/layers/output.cpp index efff58df4..8977464b1 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -6,6 +6,28 @@ namespace marian { namespace mlp { +// @TODO: get rid of factored code altogether +static std::string getLemmaDependency(int lemmaDimEmb, const std::string& lemmaDependencyIn) { + // ensures factors backward compatibility whilst keeping the more user friendly CLI + std::string lemmaDependencyOut; + if(lemmaDependencyIn.empty()) { + if(lemmaDimEmb > 0) { + lemmaDependencyOut = "re-embedding"; + } else if(lemmaDimEmb == -1) { + lemmaDependencyOut = "lemma-dependent-bias"; + } else if(lemmaDimEmb == -2) { + lemmaDependencyOut = "soft-transformer-layer"; + } else if(lemmaDimEmb == -3) { + lemmaDependencyOut = "hard-transformer-layer"; + } else { + lemmaDependencyOut = ""; + } + } else { + lemmaDependencyOut = lemmaDependencyIn; + } + return lemmaDependencyOut; +} + /*private*/ void Output::lazyConstruct(int inputDim) { // We must construct lazily since we won't know tying nor input dim in constructor. 
if(Wt_) @@ -36,7 +58,8 @@ namespace mlp { b_ = graph_->param(name + "_b", {1, numOutputClasses}, inits::zeros()); /*const*/ int lemmaDimEmb = options_->get("lemma-dim-emb", 0); - std::string lemmaDependency = options_->get("lemma-dependency", ""); + std::string lemmaDependency = getLemmaDependency(lemmaDimEmb, options_->get("lemma-dependency", "")); + ABORT_IF(lemmaDimEmb && !factoredVocab_, "--lemma-dim-emb requires a factored vocabulary"); if(lemmaDependency == "re-embedding") { // embed the (expected) word with a different embedding matrix ABORT_IF( @@ -112,7 +135,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { Expr Plemma = nullptr; // used for lemmaDependency = lemma-dependent-bias Expr inputLemma = nullptr; // used for lemmaDependency = hard-transformer-layer and soft-transformer-layer - std::string factorsCombine = options_->get("factors-combine", ""); + std::string factorsCombine = options_->get("factors-combine", "sum"); ABORT_IF(factorsCombine == "concat", "Combining lemma and factors embeddings with concatenation on the target side is currently not supported"); for(size_t g = 0; g < numGroups; g++) { @@ -134,7 +157,8 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { factorB = slice(b_, -1, Slice((int)range.first, (int)range.second)); } /*const*/ int lemmaDimEmb = options_->get("lemma-dim-emb", 0); - std::string lemmaDependency = options_->get("lemma-dependency", ""); + std::string lemmaDependency = getLemmaDependency(lemmaDimEmb, options_->get("lemma-dependency", "")); + if((lemmaDependency == "soft-transformer-layer" || lemmaDependency == "hard-transformer-layer") && g > 0) { // this mimics one transformer layer // - attention over two inputs: diff --git a/src/models/s2s.h b/src/models/s2s.h index 104f946c9..8eb2ef8d1 100644 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -319,7 +319,7 @@ class DecoderS2S : public DecoderBase { last("vocab", opt>("vocabs")[batchIndex_]); // for factored outputs last("lemma-dim-emb", opt("lemma-dim-emb", 0)); // for factored outputs last("lemma-dependency", opt("lemma-dependency", "")); // for factored outputs - last("factors-combine", opt("factors-combine", "")); // for factored outputs + last("factors-combine", opt("factors-combine", "sum")); // for factored outputs last("output-omit-bias", opt("output-omit-bias", false)); diff --git a/src/models/transformer.h b/src/models/transformer.h index 243d2c7fc..1fed868b6 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -689,7 +689,7 @@ class DecoderTransformer : public Transformer { "output-approx-knn", opt>("output-approx-knn", {}), "lemma-dim-emb", opt("lemma-dim-emb", 0), // for factored outputs "lemma-dependency", opt("lemma-dependency", ""), // for factored outputs - "factors-combine", opt("factors-combine", "")); // for factored outputs + "factors-combine", opt("factors-combine", "sum")); // for factored outputs if(opt("tied-embeddings") || opt("tied-embeddings-all")) outputFactory.tieTransposed(opt("tied-embeddings-all") || opt("tied-embeddings-src") ? "Wemb" : prefix_ + "_Wemb"); From a23cc77e5f7132e405e99dcdcf657f5aceace08a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 27 Feb 2023 21:53:41 +0000 Subject: [PATCH 222/254] Merged PR 27976: Introduce new layer framework into master Introduces the new layer framework into Marian master. This is currently not used anywhere for the existing models unless explicitly asked for. This also shouldn't modify any major existing functionality. 
The goal of this PR is to have the new code in master and use it for new things instead of the old code. FYI: @ @ the files in `src/layers_new` are the new framework. The rest is mostly unchanged apart from small modifications that allow for interaction with the new code. For now it exists in parallel to the old code. --- .gitignore | 3 +- CHANGELOG.md | 4 + VERSION | 2 +- src/CMakeLists.txt | 2 + src/command/marian_conv.cpp | 1 - src/common/utils.cpp | 30 ++ src/common/utils.h | 16 + src/graph/cached_expression.h | 70 ++++ src/graph/expression_operators.cpp | 7 +- src/graph/expression_operators.h | 2 +- src/graph/node_operators_binary.h | 45 ++- src/layers_new/attention.h | 192 ++++++++++ src/layers_new/decoder.h | 136 +++++++ src/layers_new/embeddings.h | 239 ++++++++++++ src/layers_new/interface.h | 550 ++++++++++++++++++++++++++ src/layers_new/neuralnet.cpp | 24 ++ src/layers_new/neuralnet.h | 300 +++++++++++++++ src/layers_new/rnn.h | 126 ++++++ src/layers_new/transformer.h | 553 +++++++++++++++++++++++++++ src/models/model_factory.cpp | 46 ++- src/models/s2s.h | 8 +- src/models/states.h | 36 +- src/models/transformer.h | 44 +-- src/models/transformer_factory.h | 162 ++++++++ src/models/transformer_new.h | 245 ++++++++++++ src/tensors/cpu/tensor_operators.cpp | 7 +- src/tests/CMakeLists.txt | 1 + src/tests/transformer_new.cpp | 11 + 28 files changed, 2754 insertions(+), 108 deletions(-) create mode 100644 src/graph/cached_expression.h create mode 100644 src/layers_new/attention.h create mode 100644 src/layers_new/decoder.h create mode 100644 src/layers_new/embeddings.h create mode 100644 src/layers_new/interface.h create mode 100644 src/layers_new/neuralnet.cpp create mode 100644 src/layers_new/neuralnet.h create mode 100644 src/layers_new/rnn.h create mode 100644 src/layers_new/transformer.h create mode 100644 src/models/transformer_new.h create mode 100644 src/tests/transformer_new.cpp diff --git a/.gitignore b/.gitignore index 956ce6847..d7f2f4df3 100644 --- a/.gitignore +++ b/.gitignore @@ -61,5 +61,4 @@ examples/mnist/*ubyte /vs/MarianDll.VC.VC.opendb .vs -.vscode - +.vscode diff --git a/CHANGELOG.md b/CHANGELOG.md index aa6f06bee..6a7316be9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Added + +- New experimental layer framework for Transformer-like models. 
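Among the new files, src/layers_new/attention.h (shown near the end of this patch) adds a MultiplicativeAttention layer implementing standard scaled dot-product attention; in simplified form the computation is

    Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k) + M) V

where d_k is the per-head key dimension and M is the log-domain mask for illegal connections. The 1/sqrt(d_k) factor is the scaling applied before the batched dot product to keep the logits in a numerically safe range.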
+ ### Fixed - Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp) diff --git a/VERSION b/VERSION index 51b86ba24..41de27dfa 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.1 +v1.12.2 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f095f2eb8..f9d5a5e5b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -80,6 +80,8 @@ set(MARIAN_SOURCES layers/logits.cpp layers/lsh.cpp + layers_new/neuralnet.cpp + rnn/cells.cpp rnn/attention.cpp diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index b4a5f3745..12412a238 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -8,7 +8,6 @@ int main(int argc, char** argv) { using namespace marian; - createLoggers(); auto options = New(); diff --git a/src/common/utils.cpp b/src/common/utils.cpp index 1f3fd6c07..c058d4874 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -440,3 +440,33 @@ double parseNumber(std::string param) { } // namespace utils } // namespace marian + + +// Code for demangling gnu g++ type names, closing/re-opening namespaces to keep things local +// This is used to determine Layer type names for display and nameing. +#ifdef __GNUG__ +#include +#endif + +namespace marian { +namespace utils { + +#ifdef __GNUG__ // gnu g++ and clang seem to do this similarly +std::string cxxTypeNameDemangle(const char* name) { + int status = -4; // some arbitrary value to eliminate the compiler warning + // __cxa_demangle allocates a string that has to be freed, we pass the deallocation function + std::unique_ptr res( + abi::__cxa_demangle(name, NULL, NULL, &status), + std::free + ); + return (status == 0) ? res.get() : name; +} +#else +// does nothing if not g++, should be correct for MSVC +std::string cxxTypeNameDemangle(const char* name) { + return name; +} +#endif + +} // namespace utils +} // namespace marian diff --git a/src/common/utils.h b/src/common/utils.h index fbcf672d7..5f3695fc0 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace marian { @@ -66,6 +67,21 @@ std::string findReplace(const std::string& in, const std::string& what, const st double parseDouble(std::string s); double parseNumber(std::string s); +std::string cxxTypeNameDemangle(const char* name); + +// return type name via object of given type +template +std::string cxxTypeName(const T& t) { + return cxxTypeNameDemangle(typeid(t).name()); +} + +// return type name via templated type +template +std::string cxxTypeName() { + return cxxTypeNameDemangle(typeid(T).name()); +} + + // prints vector values with a custom label. template void Debug(const T *arr, size_t size, const std::string &str) { diff --git a/src/graph/cached_expression.h b/src/graph/cached_expression.h new file mode 100644 index 000000000..f7adff8bc --- /dev/null +++ b/src/graph/cached_expression.h @@ -0,0 +1,70 @@ +#include "common/definitions.h" +#include "common/intrusive_ptr.h" +#include "graph/expression_graph.h" + +#include + +namespace marian { + +// This class allows for simpler caching of Expr objects and automatic checking if the +// cached Expr needs to be updated/recreated. +class CachedExpr { + private: + ENABLE_INTRUSIVE_PTR(CachedExpr); + + Expr cachedKey_{nullptr}; + Expr cachedValue_{nullptr}; + + typedef std::function ApplyFunT; + typedef std::function EqualFunT; + + UPtr applyFun_; // function that creates the cached result + UPtr equalFun_; // function that checks if the input changed. 
If yes, + // the `apply_` functions gets reapplied and the new result + // is cached. + + public: + // No functors are given; they will have to supplied when calling `apply`. + CachedExpr() {}; + + // No apply functor is given; it will have to supplied when calling `apply`. + CachedExpr(EqualFunT equalFun) + : equalFun_(new EqualFunT(equalFun)) {}; + + // Both functors are given, and will be used by default. They can however be overriden + // if supplied directly in `apply`. + CachedExpr(ApplyFunT applyFun, EqualFunT equalFun) + : applyFun_(new ApplyFunT(applyFun)), equalFun_(new EqualFunT(equalFun)) {}; + + // lazily executes the factory `applyFun` if `equalFun` indicates that the input has changed. + Expr apply(Expr key, ApplyFunT applyFun, EqualFunT equalFun) { + if(!cachedKey_ || !equalFun(cachedKey_, key)) { + cachedKey_ = key; + cachedValue_ = applyFun(key); + } + return cachedValue_; + } + + // lazily executes the factory `applyFun` if a equality check that has been passed to the constructor + // indicates that the input has changed. + Expr apply(Expr key, ApplyFunT applyFun) { + ABORT_IF(!equalFun_, "Equality check has not been passed to constructor"); + return apply(key, applyFun, *equalFun_); + } + + // lazily executes a factory if a equality check indicates that the input has changed. Both, + // the factory and the equality check have to have been passed to the constructor. + Expr apply(Expr key) { + ABORT_IF(!equalFun_, "Equality check has not been passed to constructor"); + ABORT_IF(!applyFun_, "Apply factory has not been passed to constructor"); + return apply(key, *applyFun_, *equalFun_); + } + + // clears any cached values, calling apply after this will trigger recomputation. + void clear() { + cachedKey_ = nullptr; + cachedValue_ = nullptr; + } +}; + +} diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index b0d40949b..a6504ebac 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -759,8 +759,7 @@ Expr transpose(Expr a, const std::vector& axes) { return Expression(a, axes); } -Expr swapAxes(Expr x, int axis1, int axis2) -{ +Expr swapAxes(Expr x, int axis1, int axis2) { const auto& shape = x->shape(); axis1 = shape.axis(axis1); axis2 = shape.axis(axis2); @@ -880,8 +879,8 @@ Expr rmsNorm(Expr x, return Expression(nodes, eps); } -Expr highway(Expr y, Expr x, Expr t) { - std::vector nodes = {y, x, t}; +Expr highway(Expr input1, Expr input2, Expr gate) { + std::vector nodes = {input1, input2, gate}; return Expression(nodes); } diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index cc3e6028b..faef5c29e 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -976,7 +976,7 @@ static inline Expr dropout(Expr x, float dropProb, Shape shape) { /** - * Performs dropout with a given probably. + * Performs dropout with a given probability. 
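Returning to the CachedExpr helper added above in src/graph/cached_expression.h: a minimal usage sketch, with hypothetical functor choices and a made-up helper name rather than anything taken from this PR, could look like this, assuming a rank-4 input:

    #include "graph/cached_expression.h"
    #include "graph/expression_operators.h"

    using namespace marian;

    // Hypothetical helper: rebuild a transposed view of `input` only when its shape
    // changes between calls; otherwise the previously built Expr is returned.
    Expr cachedTransposedView(CachedExpr& cache, Expr input) {
      return cache.apply(
          input,
          /*applyFun=*/[](Expr key) { return transpose(key, {0, 1, 3, 2}); },
          /*equalFun=*/[](Expr cachedKey, Expr newKey) {
            return cachedKey->shape() == newKey->shape();
          });
    }

The equality functor alone decides when the cached value counts as stale, so "unchanged" can mean shape equality, pointer equality, or whatever else the calling layer needs; clear() forces recomputation on the next apply().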
*/ static inline Expr dropout(Expr x, float dropProb) { if(dropProb == 0) diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 292554bd0..2c997d577 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -577,8 +577,8 @@ class DotBatchedNodeOp : public NaryNodeOp { // df/dB += alpha * dot(op(A).T, D) // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C // to sum gradients from different graph parts - - if(!transA_ && transB_) + + if(!transA_ && transB_) { return {NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), adj_, @@ -595,8 +595,7 @@ class DotBatchedNodeOp : public NaryNodeOp { false, 1.0, scalar_))}; - - if(transA_ && !transB_) + } else if(transA_ && !transB_) { return {NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), child(1)->val(), @@ -613,8 +612,7 @@ class DotBatchedNodeOp : public NaryNodeOp { false, 1.0, scalar_))}; - - if(transA_ && transB_) + } else if(transA_ && transB_) { return {NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), child(1)->val(), @@ -631,23 +629,24 @@ class DotBatchedNodeOp : public NaryNodeOp { true, 1.0, scalar_))}; - - return {NodeOp(ProdBatched(child(0)->grad(), - graph()->allocator(), - adj_, - child(1)->val(), - false, - true, - 1.0, - scalar_)), - NodeOp(ProdBatched(child(1)->grad(), - graph()->allocator(), - child(0)->val(), - adj_, - true, - false, - 1.0, - scalar_))}; + } else { // !transA && !transB + return {NodeOp(ProdBatched(child(0)->grad(), + graph()->allocator(), + adj_, + child(1)->val(), + false, + true, + 1.0, + scalar_)), + NodeOp(ProdBatched(child(1)->grad(), + graph()->allocator(), + child(0)->val(), + adj_, + true, + false, + 1.0, + scalar_))}; + } } const std::string type() override { return "bdot"; } diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h new file mode 100644 index 000000000..035e6c51d --- /dev/null +++ b/src/layers_new/attention.h @@ -0,0 +1,192 @@ +#pragma once + +#include "graph/cached_expression.h" +#include "layers_new/neuralnet.h" + +namespace marian { +namespace nn { + +// Abstract base class for attention mechanisms +class AttentionLayer : public Layer, + public IQuaternaryLayer { +protected: + using Layer::namedLayers_; + +public: + AttentionLayer(Ptr graph) : Layer(graph) {} + virtual ~AttentionLayer() = default; +}; + +class MultiplicativeAttention : public AttentionLayer { +protected: + using AttentionLayer::namedLayers_; + +public: + Ptr attentionDropout; + + MultiplicativeAttention(Ptr graph, float dropoutProbability) + : AttentionLayer(graph) { + attentionDropout = New(graph, dropoutProbability); + registerLayer(attentionDropout); + } + + virtual ~MultiplicativeAttention() = default; + + virtual Expr apply(Expr query, Expr keys, Expr values, Expr logMask = nullptr) const override { + int dimKeys = keys->shape()[-1]; + + // softmax over batched dot product of query and keys (applied over all + // time steps and batch entries), also add logMask for illegal connections + + // multiplicative attention with flattened softmax + float scale = 1.0f / std::sqrt((float)dimKeys); // scaling to avoid extreme values due to matrix multiplication + + // query, keys and values: [beam depth * batch size, num heads, length, head dim] + auto z = bdot(query, keys, false, true, scale); // [beam depth, batch size * num heads, max tgt length, max src length] + + // mask out garbage beyond end of sequences + if(logMask) + z = z + logMask; + + // take softmax along src sequence axis (-1) + auto weights 
= softmax(z); // [beam depth, batch size * num heads, max tgt length, max src length] + +#if 0 // @TODO: make this work again + if(saveAttentionWeights) + collectOneHead(weights, dimBeam); +#endif + + // optional dropout for attention weights + weights = attentionDropout->apply(weights); + + // apply attention weights to values + // weights: [beam depth, batch size * num heads, max tgt length, max src length] + // values: [beam depth, batch size * num heads, src length, head dim] + auto output = bdot(weights, values); // [beam depth, batch size * num heads, max tgt length, split vector dim] + return output; + } +}; + +template // Currently only used for MultiplicativeAttention +class MultiHeadAttention : public AttentionType { +protected: + using AttentionType::namedLayers_; + +private: + IPtr cachedKh_; // cached result of key projection + IPtr cachedVh_; // cached result of value projection + +public: + Ptr qProj; // query projection layer + Ptr kProj; // key projection layer + Ptr vProj; // value projection layer + Ptr oProj; // output projection layer + + int numHeads; + int attDim; + int modelDim; + + MultiHeadAttention(Ptr graph, + int numHeads, + int attDim, + int modelDim, + float dropoutProbability) + : AttentionType(graph, dropoutProbability), + cachedKh_(new CachedExpr()), + cachedVh_(new CachedExpr()), + numHeads(numHeads), + attDim(attDim), + modelDim(modelDim) { + qProj = New(graph, attDim); + registerLayer(qProj); + kProj = New(graph, attDim); + registerLayer(kProj); + vProj = New(graph, attDim); + registerLayer(vProj); + + oProj = New(graph, modelDim); + registerLayer(oProj); + } + + virtual ~MultiHeadAttention() = default; + +private: + // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to + // be able to do an efficient batched matmul. 
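+  //
+  // Illustrative walk-through of splitHeads() below (the concrete numbers are made up for this
+  // sketch and are not taken from the code): with attDim = 512 and numHeads = 8 (head dim 64),
+  // an input of shape [beam=1, batch=32, steps=20, 512] is first reshaped to [1*32, 20, 8, 64],
+  // then transposed to [32, 8, 20, 64], and finally reshaped to [1, 32*8, 20, 64], so that bdot()
+  // can treat batch and heads as a single batched dimension. joinHeads() reverses these steps.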
+ Expr splitHeads(Expr input) const { + int dimSteps = input->shape()[-2]; + int dimBatch = input->shape()[-3]; + int dimBeam = input->shape()[-4]; + int dimDepth = attDim / numHeads; + + auto output = reshape(input, {dimBeam * dimBatch, dimSteps, numHeads, dimDepth}); + output = transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, numHeads, dimSteps, dimDepth] + output = reshape(output, {dimBeam, dimBatch * numHeads, dimSteps, dimDepth}); + return output; + } + + // Undoes the effects of the above function by reversing the transposition and reshaping back to original shape + Expr joinHeads(Expr input) const { + int dimDepth = input->shape()[-1]; + int dimSteps = input->shape()[-2]; + int dimBatchHeads = input->shape()[-3]; + int dimBeam = input->shape()[-4]; + int dimModel = numHeads * dimDepth; + int dimBatch = dimBatchHeads / numHeads; + + auto output = reshape(input, {dimBeam * dimBatch, numHeads, dimSteps, dimDepth}); + output = transpose(output, {0, 2, 1, 3}); + output = reshape(output, {dimBeam, dimBatch, dimSteps, dimModel}); + return output; + } + +public: + virtual Expr apply(Expr query, Expr keys, Expr values, Expr mask) const override { + auto qh = splitHeads(qProj->apply(query)); + + // @TODO: in original implementation we use shape()->elements(), dunno why + auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; + + // these two get conditionally recomputed if their size changes according to criterion above + auto kh = cachedKh_->apply(keys, [this](Expr keys) { + return splitHeads(kProj->apply(keys)); + }, equal); + + auto vh = cachedVh_->apply(values, [this](Expr values) { + return splitHeads(vProj->apply(values)); + }, equal); + + auto output = AttentionType::apply(qh, kh, vh, mask); + + output = joinHeads(output); + output = oProj->apply(output); + + return output; + } + + virtual void clear() override { + Layer::clear(); + cachedKh_->clear(); + cachedVh_->clear(); + } +}; + +static Ptr attentionFromOptions(Ptr graph, Ptr options) { + // @TODO: currently this does nothing as it isn't set anywhere + std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default + + // in the future we might add SingleHead or Additive or LSH-based as in Reformer + if(selfAttentionType == "default") { + int numHeads = options->get("transformer-heads"); + int modelDim = options->get("dim-emb"); + float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); + + return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + } + else { + ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); + } +} + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/decoder.h b/src/layers_new/decoder.h new file mode 100644 index 000000000..406017d64 --- /dev/null +++ b/src/layers_new/decoder.h @@ -0,0 +1,136 @@ +#pragma once + +#include "common/utils.h" +#include "graph/expression_graph.h" +#include "graph/expression_operators.h" +#include "graph/node_initializers.h" + +#include "layers_new/interface.h" + +namespace marian { +namespace nn { + +// Interface: decoder state +struct DecoderState : public IClassName, public std::enable_shared_from_this { +protected: + size_t position{0}; + +public: + DecoderState(size_t position) : position(position) {} + virtual ~DecoderState() {} + + virtual void incrementPosition() { + position++; + } + + virtual size_t getPosition() { + return position; + } + + // Dynamic cast to requested layer type. 
Will return nullptr if not possible + template + Ptr as() { + return std::dynamic_pointer_cast(shared_from_this()); + } + + // Dynamic cast to requested layer type. Will return nullptr if not possible + template + Ptr as() const { + return const_cast(this)->as(); + } + + // Dynamic cast to requested layer type. Will abort if the cast is not possible. + template + Ptr cast() { + auto stateCast = as(); + ABORT_IF(!stateCast, "State {} cannot be cast to requested type {}", + className(), + utils::cxxTypeName()); + return stateCast; + } + + template + Ptr cast() const { + return const_cast(this)->cast(); + } +}; + +class DecoderStateItem : public DecoderState { +private: + Expr state_; + +public: + DecoderStateItem(Expr state, size_t position) : DecoderState(position), state_(state) {} + virtual ~DecoderStateItem() = default; + + Expr get() { return state_; } + void set(Expr state) { state_ = state; } +}; + +class DecoderStateList : public DecoderState { +private: + std::vector> items_; + +public: + DecoderStateList(size_t position) : DecoderState(position) {} + virtual ~DecoderStateList() = default; + + void incrementPosition() override { + DecoderState::incrementPosition(); + for(auto& item : items_) { + item->incrementPosition(); + ABORT_IF(position != item->getPosition(), "Positions out of sync??"); + } + } + + void append(Ptr item) { + ABORT_IF(position != item->getPosition(), "DecoderStateList.position ({}) != DecoderStateItem.position ({}) ?", position, item->getPosition()); + items_.push_back(item); + } + + /** + * Retrieve DecoderStateItem at index i + */ + Ptr at(size_t i) const { + return items_[i]; + } + + auto begin() -> decltype(items_.begin()) const { + return items_.begin(); + } + + auto end() -> decltype(items_.end()) const { + return items_.end(); + } + + size_t size() const { return items_.size(); } +}; + + +// Interface: Unary function +struct IUnaryDecoderLayer { + virtual Expr apply(Expr /*input*/, Ptr /*state*/) const = 0; +}; + +// Interface: Binary function +struct IBinaryDecoderLayer { + virtual Expr apply(Expr, Expr, Ptr /*state*/) const = 0; +}; + +// Interface: Ternary function +struct ITernaryDecoderLayer { + virtual Expr apply(Expr, Expr, Expr, Ptr /*state*/) const = 0; +}; + +// Interface: 4ary function +struct IQuaternaryDecoderLayer { + virtual Expr apply(Expr, Expr, Expr, Expr, Ptr /*state*/) const = 0; +}; + +// Interface: N-Ary function +struct INaryLayerDecoderLayer { + virtual Expr apply(const std::vector& /*inputs*/, Ptr /*state*/) const = 0; +}; + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/embeddings.h b/src/layers_new/embeddings.h new file mode 100644 index 000000000..b7d297b63 --- /dev/null +++ b/src/layers_new/embeddings.h @@ -0,0 +1,239 @@ +#pragma once + +#include "layers_new/interface.h" +#include "data/corpus_base.h" +#include "data/factored_vocab.h" + +namespace marian { +namespace nn { + +// Embedding from corpus sub-batch to (emb, mask) +struct IEmbeddingLayer { + virtual std::tuple apply(Ptr subBatch) const = 0; + + virtual Expr apply(const Words& embIdx, const Shape& shape) const = 0; + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& embIdx, const Shape& shape) const = 0; +}; + +struct IPositionEmbeddingLayer { + virtual Expr apply(Expr, int startPosition = 0) = 0; +}; + +// A regular embedding layer. +// Note that this also applies dropout if the option is passed (pass 0 when in inference mode). 
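+// For illustration only, a direct construction could look roughly like the sketch below; the option
+// values are made-up placeholders, not defaults from this file, and the preferred path is described
+// in the next comment:
+//
+//   auto embOpts  = New<Options>("prefix", "Wemb", "dimVocab", 32000, "dimEmb", 512);
+//   auto embLayer = New<Embedding>(graph, embOpts);
+//   Expr batchEmbeddings, batchMask;
+//   std::tie(batchEmbeddings, batchMask) = embLayer->apply(subBatch);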
+// It is best to not use Embedding directly, but rather via getEmbeddingLayer() in +// EncoderDecoderLayerBase, which knows to pass on all required parameters from options. +class Embedding : public LayerWithOptions, public IEmbeddingLayer { +public: + Expr embeddings; + + Embedding(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { + std::string name = opt("prefix"); + int dimVoc = opt("dimVocab"); + int dimEmb = opt("dimEmb"); + bool fixed = opt("fixed", false); + + factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get("vocab", "")); + if (factoredVocab_) { + dimVoc = (int)factoredVocab_->factorVocabSize(); + LOG_ONCE(info, "[embedding] Factored embeddings enabled"); + } + + // Embedding layer initialization should depend only on embedding size, hence fanIn=false + auto initFunc = inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true); // -> embedding vectors have roughly unit length + + if(options_->has("embFile")) { + std::string file = opt("embFile"); + if (!file.empty()) { + bool norm = opt("normalization", false); + initFunc = inits::fromWord2vec(file, dimVoc, dimEmb, norm); + } + } + + registerParameter(embeddings, Shape({dimVoc, dimEmb}), initFunc); + embeddings->setTrainable(!fixed); // @TODO: move into registerParam macro + } + + virtual ~Embedding() = default; + + std::tuple apply(Ptr subBatch) const override final { + auto graph = embeddings->graph(); + int dimBatch = (int)subBatch->batchSize(); + int dimEmb = embeddings->shape()[-1]; + int dimWidth = (int)subBatch->batchWidth(); + + // factored embeddings: + // - regular: + // - y = x @ E x:[B x 1ofV] ; E:[V x D] ; y:[B x D] + // - factored: + // - u = x @ M one-hot to U-dimensional multi-hot (all factors in one concatenated space) + // - each row of M contains the set of factors for one word => we want a CSR matrix + // - y = (x @ M) @ E (x:[B x 1ofV] ; M:[V x U]) ; E:[U x D] ; y:[B x D] + // - first compute x @ M on the CPU + // - (Uvalues, Uindices, Uoffsets) = csr_rows(Mvalues, Mindices, Moffsets, subBatch->data()): + // - shape (U, specifically) not actually needed here + // - foreach input x[i] + // - locate row M[i,*] + // - copy through its index values (std::vector) + // - create a matching ones vector (we can keep growing) + // - convert to GPU-side CSR matrix. CSR matrix now has #rows equal to len(x) + // - CSR matrix product with E + // - csr_dot(Uvalues, Uindices, Uoffsets, embeddings, transposeU) + // - double-check if all dimensions are specified. Probably not for transpose (which would be like csc_dot()). 
+ // - weighting: + // - core factors' gradients are sums over all words that use the factors; + // - core factors' embeddings move very fast + // - words will need to make up for the move; rare words cannot + // - so, we multiply each factor with 1/refCount + // - core factors get weighed down a lot + // - no impact on gradients, as Adam makes up for it; embeddings still move fast just as before + // - but forward pass weighs them down, so that all factors are in a similar numeric range + // - if it is required to be in a different range, the embeddings can still learn that, but more slowly + + auto batchEmbeddings = apply(subBatch->data(), {dimWidth, dimBatch, dimEmb}); + auto batchMask = graph->constant({dimWidth, dimBatch, 1}, + inits::fromVector(subBatch->mask())); + return std::make_tuple(batchEmbeddings, batchMask); + } + + Expr apply(const Words& words, const Shape& shape) const override final { + if (factoredVocab_) { + Expr selectedEmbs = multiRows(words, opt("dropout", 0.0f)); // [(B*W) x E] + selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] + return selectedEmbs; + } + else + return applyIndices(toWordIndexVector(words), shape); + } + + Expr applyIndices(const std::vector& embIdx, const Shape& shape) const override final { + ABORT_IF(factoredVocab_, "Embedding: applyIndices must not be used with a factored vocabulary"); + auto selectedEmbs = rows(embeddings, embIdx); // [(B*W) x E] + selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] + // @BUGBUG: We should not broadcast along dimBatch=[-2]. Then we can also dropout before reshape() (test that separately) + selectedEmbs = dropout(selectedEmbs, opt("dropout", 0.0f), { selectedEmbs->shape()[-3], 1, 1 }); + return selectedEmbs; + } + +private: + Ptr factoredVocab_; + + // helper to embed a sequence of words (given as indices) via factored embeddings + Expr multiRows(const Words& data, float dropProb) const { + auto graph = embeddings->graph(); + auto factoredData = factoredVocab_->csr_rows(data); + // multi-hot factor vectors are represented as a sparse CSR matrix + // [row index = word position index] -> set of factor indices for word at this position + ABORT_IF(factoredData.shape != Shape({(int)factoredData.offsets.size()-1/*=rows of CSR*/, embeddings->shape()[0]}), "shape mismatch??"); + // the CSR matrix is passed in pieces + auto weights = graph->constant({ (int)factoredData.weights.size() }, inits::fromVector(factoredData.weights), Type::float32); + auto indices = graph->constant({ (int)factoredData.indices.size() }, inits::fromVector(factoredData.indices), Type::uint32); + auto offsets = graph->constant({ (int)factoredData.offsets.size() }, inits::fromVector(factoredData.offsets), Type::uint32); + // apply dropout + // We apply it to the weights, i.e. factors get dropped out separately, but always as entire vectors. 
+ weights = dropout(weights, dropProb); + // perform the product + return csr_dot(factoredData.shape, weights, indices, offsets, embeddings); + } +}; + +// Abstract base class for position embedding layers +struct PositionEmbeddingLayer : public Layer, + public IPositionEmbeddingLayer { + using Layer::namedLayers_; + using Layer::namedParameters_; + using Layer::param; + + int positionAxis; + int maxLength; + + PositionEmbeddingLayer(Ptr graph, int positionAxis, int maxLength) + : Layer(graph), positionAxis(positionAxis), maxLength(maxLength) {} + + virtual ~PositionEmbeddingLayer() = default; +}; + +struct SinusoidalPositionEmbedding : public PositionEmbeddingLayer { + using PositionEmbeddingLayer::positionAxis; + using PositionEmbeddingLayer::maxLength; + + SinusoidalPositionEmbedding(Ptr graph, int positionAxis) + : PositionEmbeddingLayer(graph, positionAxis, /*maxLength=*/-1) + {} + + virtual ~SinusoidalPositionEmbedding() = default; + + Expr apply(Expr input, int start = 0) override { + int dimEmb = input->shape()[-1]; + int dimWords = input->shape()[positionAxis]; + + input = std::sqrt((float)dimEmb) * input; // input were initialized to unit length; so norms will be in order of sqrt(dimEmb) + + Shape posEmbeddingShape; + posEmbeddingShape.resize(input->shape().size()); // resize to input shape size and fill with 1s + posEmbeddingShape.set(-1, dimEmb); // match embedding size + posEmbeddingShape.set(positionAxis, dimWords); // match number of items to embed on correct axis + + // the node initializer is dimension agnostic for dimensions other than the last + // dimension (embedding dimension) and works with any positionAxis value + auto posEmbeddings = graph()->constant(posEmbeddingShape, + inits::sinusoidalPositionEmbeddings(start)); + + input = input + posEmbeddings; + return input; + } +}; + +struct LearnedPositionEmbedding : public PositionEmbeddingLayer { + using PositionEmbeddingLayer::positionAxis; + using PositionEmbeddingLayer::maxLength; + + Expr embeddings; + + LearnedPositionEmbedding(Ptr graph, int positionAxis, int maxLength) + : PositionEmbeddingLayer(graph, positionAxis, maxLength) + {} + + virtual ~LearnedPositionEmbedding() = default; + + Expr apply(Expr input, int start = 0) override { + int dimEmb = input->shape()[-1]; + int dimWords = input->shape()[positionAxis]; + + registerParameter(embeddings, + Shape({maxLength, dimEmb}), + inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true)); + + ABORT_IF(start + dimWords > maxLength, + "Number of positions ({}) starting at position {} exceeds maximum length {}", + dimWords, start, maxLength); + + Shape posEmbeddingShape; + posEmbeddingShape.resize(input->shape().size()); // resize to input shape size and fill with 1s + posEmbeddingShape.set(-1, dimEmb); // match embedding size + posEmbeddingShape.set(positionAxis, dimWords); // match number of items to embed on correct axis + + auto posEmbeddings = slice(embeddings, -2, Slice(start, start + dimWords)); + posEmbeddings = reshape(posEmbeddings, posEmbeddingShape); + + input = input + posEmbeddings; + return input; + } +}; + +static Ptr positionEmbeddingFromOptions(Ptr graph, + Ptr options, + int positionAxis) { + bool trainedEmbedding = options->get("transformer-train-position-embeddings", false); + if(trainedEmbedding) { + int maxLength = options->get("max-length"); + return New(graph, positionAxis, maxLength); + } else { + return New(graph, positionAxis); + } +} + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/interface.h 
b/src/layers_new/interface.h new file mode 100644 index 000000000..d8317d610 --- /dev/null +++ b/src/layers_new/interface.h @@ -0,0 +1,550 @@ +#pragma once + +#include "common/utils.h" +#include "graph/expression_graph.h" +#include "graph/expression_operators.h" +#include "graph/node_initializers.h" + +#include + +namespace marian { +namespace nn { + +// Interface: provides a class member to return the class name (type) as a string +struct IClassName { + virtual std::string className() const { + return utils::cxxTypeName(*this); + } +}; + +// Interface: Unary function +struct IUnaryLayer { + virtual Expr apply(Expr) const = 0; +}; + +// Interface: Binary function +struct IBinaryLayer { + virtual Expr apply(Expr, Expr) const = 0; +}; + +// Interface: Ternary function +struct ITernaryLayer { + virtual Expr apply(Expr, Expr, Expr) const = 0; +}; + +// Interface: 4ary function +struct IQuaternaryLayer { + virtual Expr apply(Expr, Expr, Expr, Expr) const = 0; +}; + +// Interface: N-Ary function +struct INaryLayer { + virtual Expr apply(const std::vector& list) const = 0; +}; + +// Interface: implement a clearing function +struct IClearable { + virtual void clear() = 0; +}; + + +// Helper macro to turn parameter C++ variable name into a string. +#define registerParameter(paramArg, shape, init) \ +do { \ + if(!paramArg) { \ + paramArg = this->param(#paramArg, shape, init); \ + } \ +} while(0); + +// Helper macro to turn parameter C++ variable name into a string. +// This version is meant to be used in apply(...) functions for lazy parameter inits +// hence has to cast away constness. +#define registerParameterLazy(paramArg, shape, init) \ +do { \ + using ThisLayerType = std::decay::type; \ + ThisLayerType* thisLayer = const_cast(this); \ + if(!thisLayer->paramArg) { \ + thisLayer->paramArg = thisLayer->param(#paramArg, shape, init); \ + } \ +} while(0); + +// Helper macro to turn a layer C++ variable name into a string and to add the layer as a named sublayer to the parent layer +#define registerLayer(layerArg) \ +do { \ + ABORT_IF(!layerArg, "Layer {} of type {} is not initialized", #layerArg, utils::cxxTypeName(layerArg)); \ + namedLayers_.emplace_back(#layerArg, layerArg); \ + if(!layerArg->registered()) { \ + layerArg->setName(#layerArg); \ + layerArg->setFirstParent(this); \ + } \ +} while(0); + +// Helper macro that adds the layer as a named sublayer to the parent layer and uses the given name. Different from above as +// the C++ variable name itself is not used a name string. +#define registerLayerWithName(layerArg, name) \ +do { \ + ABORT_IF(!layerArg, "Layer {} of type {} with name {} is not initialized", #layerArg, utils::cxxTypeName(layerArg), name); \ + namedLayers_.emplace_back(name, layerArg); \ + if(!layerArg->registered()) { \ + layerArg->setName(name); \ + layerArg->setFirstParent(this); \ + } \ +} while(0); + +class Layer; + +using NamedParameter = std::pair; + +template +using NamedLayer = std::pair>; + +// Base class for all layers. Sub layers should inherit from this class and one or multiple of the interfaces (e.g. IUnaryLayer) +class Layer : public IClassName, public IClearable, public std::enable_shared_from_this { +public: + enum class Mode : int { eval, train }; + +private: + Weak graph_; + + // Using naked pointer as a weak reference. Cannot use shared_ptr or weak_ptr + // as registration happens in constructor of parent layer and shared_from_this() + // cannot be used before parent layer constructor exits. 
+ Layer* firstParent_{nullptr}; + std::string name_; + + mutable Mode mode_{Mode::train}; // eval or train ? + +protected: + std::vector namedParameters_; // vector of all named parameters belonging to this specific layer (not recurisve) + std::vector> namedLayers_; // vector of all named sublayers for this specific layer (not recursive) + + // Create a layer parameter with a full name composed of the path to this layer and localName + Expr param(const std::string& localName, const Shape& shape, const Ptr& init) { + std::string fullName = fmt::format("{}->{}", path(), localName); + auto parameter = graph()->param(fullName, shape, init); + namedParameters_.emplace_back(localName, parameter); + return parameter; + } + +public: + Layer(Ptr graph) + : graph_(graph) {} + + virtual ~Layer() = default; + + Ptr graph() { + auto graph = graph_.lock(); + ABORT_IF(!graph, "graph in layer {} expired?", path()); + return graph; + } + + const Ptr graph() const { + auto graph = graph_.lock(); + ABORT_IF(!graph, "graph in layer {} expired?", path()); + return graph; + } + +#if 1 + // @TODO: this should be removed, currently hack to init graph. + void setGraph(Ptr graph) { + graph_ = graph; + for(auto& lr: namedLayers()) + lr.second->setGraph(graph); + } +#endif + + // Dynamic cast to requested layer type. Will return nullptr if not possible + template + Ptr as() { + return std::dynamic_pointer_cast(shared_from_this()); + } + + // Dynamic cast to requested layer type. Will return nullptr if not possible + template + Ptr as() const { + return const_cast(this)->as(); + } + + // Dynamic cast to requested layer type. Will abort if the cast is not possible. + template + Ptr cast() { + auto layerCast = as(); + ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", + className(), + utils::cxxTypeName()); + return layerCast; + } + + template + Ptr cast() const { + return const_cast(this)->cast(); + } + + // Return all named parameters for this specific layer (not descending into sub-layers) + std::vector& namedParameters() { return namedParameters_; } + const std::vector& namedParameters() const { return namedParameters_; } + + // Return all named layers for this specific layer (not descending into sub-layers) + std::vector>& namedLayers() { return namedLayers_; } + const std::vector>& namedLayers() const { return namedLayers_; } + + // Return all named sub-layers for this layer and its sub-layers (descending recursively into sub-layers). + // Can be used with layer type e.g. allNamedLayers() to return only sub-layers of this type. + // Returned layers will then have the given type and do not need to be cast anymore. + template + std::vector> allNamedLayers() { + std::vector> layers; + for(auto& namedLayer : namedLayers()) { + auto castLayer = namedLayer.second->as(); + if(castLayer) + layers.emplace_back(namedLayer.first, castLayer); + + auto subLayers = namedLayer.second->allNamedLayers(); + layers.insert(layers.end(), subLayers.begin(), subLayers.end()); + } + return layers; + } + + template + std::vector> allNamedLayers() const { + return const_cast(this)->allNamedLayers(); + } + + // Returns all sub-layers (only the layers, not the names) for this layer and its sub-layers (descending + // recursively into sub-layers). Can be used with layer type e.g. allLayers() to return only + // sub-layers of this type. Returned layers will then have the given type and do not need to be cast anymore. 
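+  // For example (illustrative sketch; `encoder` stands for any layer object with registered sub-layers):
+  //
+  //   for(auto& namedLinear : encoder->allNamedLayers<Linear>())
+  //     LOG(info, "found Linear sub-layer {}", namedLinear.first);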
+ template + std::vector> allLayers() { + std::vector> layers; + for(auto namedLayer : allNamedLayers()) + layers.push_back(namedLayer.second); + return layers; + } + + template + std::vector> allLayers() const { + return const_cast(this)->allLayers(); + } + + // Used by parent layers to set the name of a sub-layer. + // @TODO: make this private and only allow friend access from layers before merging with master. + // Currently misused for top layer that has no parent layer that can set its name. + void setName(const std::string& name) { name_ = name; } + + const std::string& name() const { return name_; } + + // This sets the first parent of a sublayer (the layer a sublayer was first registered with). + // This is required to generate the correct path/name for layer parameters at saving time. + void setFirstParent(Layer* parent) { + ABORT_IF(firstParent_ != nullptr, "Parent layer has already been set"); + ABORT_IF(parent == this, "Parent layer has to be different from child"); + firstParent_ = parent; + } + + // The parent layer of a sublayer is the first layer the sublayer has been registered with. + // Subsequent calls to setFirstParent will abort if the parent is already set. + bool registered() const { + return firstParent_ != nullptr; + } + + std::string path() const { + std::vector path; + if(firstParent_) + path.push_back(firstParent_->path()); + path.push_back(name_); + return marian::utils::join(path, "->"); + } + + std::string layerInfo(bool includeChildren=false) const { + std::stringstream ss; + std::function recurse; + recurse = [&](const Layer* layer, int level) { + auto indent = utils::join(std::vector(level, " "), ""); + ss << indent << layer->name() << " : " << layer->className() << std::endl; + for(auto& pr: layer->namedParameters()) + ss << indent << " " << pr.first << " : " << pr.second->shape() << std::endl; + if(includeChildren) + for(auto& lr: layer->namedLayers()) + recurse(lr.second.get(), level + 1); + }; + recurse(this, 0); + return ss.str(); + } + + // Return Mode::eval or Mode::train. This is used to determine if training only layer-internal actions + // like dropout should be run. This will not affect graph-internal gradient propagation unless somehow + // specified in a layer. + Mode getMode() const { + #if 1 + if(graph()->isInference()) { + return Mode::eval; + } else { + return Mode::train; + } + #else + return mode_; + #endif + } + + // Set mode to Mode::eval for this layer and all sub-layers. This will disable dropout and similar actions. + void setEvalMode() { + mode_ = Mode::eval; + for(auto& lr: namedLayers()) + lr.second->setEvalMode(); + } + + // Set mode to Mode::train for this layer and all sub-layers. This will enable dropout and similar actions. + void setTrainMode() { + mode_ = Mode::train; + for(auto& lr: namedLayers()) + lr.second->setTrainMode(); + } + + virtual void clear() override { + for(auto& lr : namedLayers()) + lr.second->clear(); + } +}; + +class LayerWithOptions : public Layer { +protected: + Ptr options_; + +public: + LayerWithOptions(Ptr graph, Ptr options) + : Layer(graph), options_(options) {} + + virtual ~LayerWithOptions() = default; + + template + T opt(const std::string key) const { + return options_->get(key); + } + + template + T opt(const std::string key, const T& defaultValue) const { + return options_->get(key, defaultValue); + } +}; + +/** + * Wrapper to be used exclusively inside LayerList or other similar containers. This is allows to use the apply(...) 
functions + * of a layer without having to cast to specific type (this is done internally based on the number of arguments). Inspired by + * boost::any_type which allows to construct containers that hold various types. + * This should allow to use any layer and iterfaces will be added here as required. + */ +class AnyLayer final : public IUnaryLayer, + public IBinaryLayer, + public ITernaryLayer, + public IQuaternaryLayer, + public INaryLayer, + public IClearable { +private: + Ptr layer_; + +protected: + // private/protected constructor, should only be created within listed classes with friendship + AnyLayer(const Ptr& layer) + : layer_(layer) {} + + friend class LayerList; + +public: + // Dynamic cast to requested layer type. Will return nullptr if not possible + template + Ptr as() const { + return std::dynamic_pointer_cast(layer_); + } + + // Dynamic cast to requested layer type. Will abort if the cast is not possible. + template + Ptr cast() const { + auto layerCast = as(); + ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", + layer_->className(), + utils::cxxTypeName()); + return layerCast; + } + + Expr apply(Expr input) const override { + return cast()->apply(input); + } + + Expr apply(Expr input1, Expr input2) const override { + return cast()->apply(input1, input2); + } + + Expr apply(Expr input1, Expr input2, Expr input3) const override { + return cast()->apply(input1, input2, input3); + } + + Expr apply(Expr input1, Expr input2, Expr input3, Expr input4) const override { + return cast()->apply(input1, input2, input3, input4); + } + + Expr apply(const std::vector& inputs) const override { + return cast()->apply(inputs); + } + + virtual void clear() override { + cast()->clear(); + } +}; + +/** + * Holds sublayers in a list and performs correct registration of sublayers. Sublayers are indexed + * and can be accessed like array elements, including iteration. + * `LayerList` -- in contrast to `Sequential` -- does not provide `apply` functions. + * You have to define the execution order and information flow in code. + * + * See TransformerEncoder for an example where we hold the transformer layer stack in a LayerList, + * but define a custom apply function (due to masks being external information and shared between layers). + */ +class LayerList : public Layer { +protected: + std::vector> layers_; + + template + void recursiveAppend(Last last) { + append(last); + } + + template + void recursiveAppend(First first, Rest ...rest) { + append(first); + recursiveAppend(rest...); + } + +public: + LayerList(Ptr graph) + : Layer(graph) {} + + template + LayerList(Ptr graph, Layers ...layers) + : Layer(graph) { + recursiveAppend(layers...); + } + + virtual ~LayerList() = default; + + /** + * This inserts an already existing sublayer from this or a different container which will result in + * parameter sharing if there are parameters. + ``` + auto layers = New(graph); + layers->append(New(graph, 100)); // <- creates a new sublayer and registers it. + layers->append(layers->at(0)); // <- no new sublayer created or registered; reference the first one. 
+ ``` + */ + void append(const Ptr& layer) { + layers_.push_back(layer); + } + + void append(const Ptr& layer) { + std::string name = fmt::format("at({})->as<{}>()", layers_.size(), layer->className()); + registerLayerWithName(layer, name); + layers_.emplace_back(new AnyLayer(layer)); // not using New<...> because of missing friendship + } + + /** + * Retrieve sublayer at index i + */ + Ptr at(size_t i) const { + return layers_[i]; + } + + auto begin() -> decltype(layers_.begin()) const { + return layers_.begin(); + } + + auto end() -> decltype(layers_.end()) const { + return layers_.end(); + } + + size_t size() const { return layers_.size(); } + + virtual void clear() override { + for(auto& layer : layers_) + layer->clear(); + } +}; + +/** + * `Sequential` is a list of layers similar to `LayerList`, but does provide a set of `apply` functions. + * These function assume that the first element in the container can be a unary, binary, ternary + * or n-ary layer, but all subsequent layers have to be unary layers as they will consume the single + * output of their preceding layer. Non-unary layers will fail to execute during runtime if they are + * not the very first layer. + * + * `Sequential` can be used to implement typical feed forward networks: + * + ``` + using namespace marian::nn; + + auto seq = New(graph, + New(graph, 100), + New(graph), + New(graph, 0.1f), + New(graph, 100), + New(graph), + New(graph) + ); + + Expr output = seq->apply(input); + ``` + * For other application patterns use `LayerList` and implement them yourself by traversing the layers. + */ +class Sequential : public LayerList, + public IUnaryLayer, + public IBinaryLayer, + public ITernaryLayer, + public IQuaternaryLayer, + public INaryLayer { +public: + Sequential(Ptr graph) + : LayerList(graph) {} + + template + Sequential(Ptr graph, Layers ...layers) + : LayerList(graph, layers...) {} + + virtual ~Sequential() = default; + + Expr apply(Expr input) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input)); + } + + Expr apply(Expr input1, Expr input2) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input1, input2)); + } + + Expr apply(Expr input1, Expr input2, Expr input3) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input1, input2, input3)); + } + + Expr apply(Expr input1, Expr input2, Expr input3, Expr input4) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(input1, input2, input3, input4)); + } + + Expr apply(const std::vector& inputs) const override { + ABORT_IF(layers_.empty(), "Applying empty Sequential layer?"); + return applyTail(layers_[0]->apply(inputs)); + } + +private: + // apply remaining layers after first layer has been applied. + Expr applyTail(Expr input) const { + Expr output = input; + for(int i = 1; i < layers_.size(); ++i) + output = layers_[i]->apply(output); + return output; + } + +}; + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/neuralnet.cpp b/src/layers_new/neuralnet.cpp new file mode 100644 index 000000000..11f9ae63d --- /dev/null +++ b/src/layers_new/neuralnet.cpp @@ -0,0 +1,24 @@ +#include "layers_new/neuralnet.h" + +namespace marian { +namespace nn { + +// Factory for activation function layers from name as string. 
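+// Usage sketch (illustrative; `x` is assumed to be an existing Expr): the returned layer only
+// relies on the IUnaryLayer interface, so it can be applied directly or appended to a Sequential:
+//
+//   auto act = activationLayerByName(graph, "gelu");
+//   Expr y   = act->apply(x);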
+Ptr activationLayerByName(Ptr graph, const std::string& actName) { + // @TODO: lowercase actName first? + if(actName == "relu") + return New(graph); + else if(actName == "gelu") + return New(graph); + else if(actName == "tanh") + return New(graph); + else if(actName == "sigmoid") + return New(graph); + else if(actName == "swish") + return New(graph); + else + ABORT("Unknown activation function: {}", actName); +} + +} +} diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h new file mode 100644 index 000000000..51f2ef4e3 --- /dev/null +++ b/src/layers_new/neuralnet.h @@ -0,0 +1,300 @@ +#pragma once + +#include "layers_new/interface.h" +#include "graph/node_initializers.h" + +namespace marian { +namespace nn { + +static inline Expr swapTimeBatch(Expr input) { return swapAxes(atleast_4d(input), -2, -3); } + + // @TODO: this is an odd function to be here, this should rather be handled somewhere globally? + // convert multiplicative 1/0 mask to additive 0/-inf log mask, and transpose to match result of bdot() op in Attention() +static inline Expr transposedLogMask(Expr mask, int dimHeads) { + if(!mask) + return nullptr; + + // LayerAttention expects mask in a different layout + int dimBatch = mask->shape()[-3]; + int dimSrcWords = mask->shape()[-2]; + mask = reshape(mask, {dimBatch, 1, 1, dimSrcWords}); // [batch size, num heads broadcast=1, max length broadcast=1, max length] + + float maskFactor = std::max(NumericLimits(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + auto logMask = (1 - mask) * maskFactor; + logMask = reshape(repeat(logMask, dimHeads, -3), {1, dimBatch * dimHeads, 1, dimSrcWords}); + return logMask; +} + +/** + * A generic Activation function layer. Any unary Marian operator or function accepted by + * `std::function` can be turned into an activation function like this: + ``` + auto reluLayer = New(graph, (Expr(*)(Expr))relu) + ``` + * The function pointer cast may be required to disambiguate the operator name if operators + * of the same name but with a different sets of parameters exist, otherwise it can be dropped + * or replaced with a more readable lambda function. + * + * `Activation` will also accept lambdas for more complex activations: + ``` + // a reasonably accurate approximation of GELU + auto geluApprox = New(graph, [](Expr x) { return x * sigmoid(1.702f * x); }); + ``` + */ +class Activation : public Layer, public IUnaryLayer { +private: + std::function actFn; + +public: + Activation(Ptr graph, + const std::function& actFn) + : Layer(graph), actFn(actFn) {} + + virtual ~Activation() = default; + + Expr apply(Expr x) const override { + return actFn(x); + } +}; + +// A ReLU activation function layer defined via `Activation`. +struct ReLU final : public Activation { + ReLU(Ptr graph) : Activation(graph, (Expr(*)(Expr))relu) {} +}; + +// A GELU activation function layer defined via `Activation`. +struct GELU final : public Activation { + GELU(Ptr graph) : Activation(graph, (Expr(*)(Expr))gelu) {} +}; + +// A Tanh activation function layer defined via `Activation`. +struct Tanh final : public Activation { + Tanh(Ptr graph) : Activation(graph, (Expr(*)(Expr))tanh) {} +}; + +// A Sigmoid activation function layer defined via `Activation`. +struct Sigmoid final : public Activation { + Sigmoid(Ptr graph) : Activation(graph, (Expr(*)(Expr))sigmoid) {} +}; + +// A Swish activation function layer defined via `Activation`. 
+struct Swish final : public Activation { + Swish(Ptr graph) : Activation(graph, (Expr(*)(Expr))swish) {} +}; + +// Factory for activation function layers from name as string. +Ptr activationLayerByName(Ptr graph, const std::string& actName); + +// Applies a linear transformation to the incoming data: y = xA^T + b +struct Linear : public Layer, public IUnaryLayer { + Expr weight; + Expr bias; + + int dimOut; + bool useBias{true}; + bool transposed{false}; + Ptr init; + + // Typical constructor that can take an initializer function + Linear(Ptr graph, + int dimOut, + bool useBias = true, + bool transposed = false, + Ptr init = inits::glorotUniform()) + : Layer(graph), dimOut(dimOut), useBias(useBias), init(init) + {} + + // Alternate constructor which takes a weight parameter that will be re-used, e.g. for tied output weights. + // Since the weights are already initialized there is no initializer. Output dimension is initialized from + // the given weight parameter. + Linear(Ptr graph, + Expr tiedWeight, + bool useBias = true, + bool transposed = false) + : Layer(graph), weight(tiedWeight), dimOut(weight->shape()[-1]), useBias(useBias), init(nullptr) + {} + + virtual ~Linear() = default; + + Expr apply(Expr x) const override { + int dimIn = x->shape()[-1]; + + // if weight is already initialized nothing happens here + if(transposed) { + registerParameterLazy(weight, Shape({ dimOut, dimIn }), init); + } else { + registerParameterLazy(weight, Shape({ dimIn, dimOut }), init); + } + + if(useBias) { + registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); + } + + if(useBias) + return marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed); + else + return marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); + } +}; + +struct Dropout final : public Layer, public IUnaryLayer { + float dropoutProbabilty; + UPtr dropoutMaskShape; + + Dropout(Ptr graph, + float dropoutProbabilty, + const Shape& dropoutMaskShape) + : Layer(graph), dropoutProbabilty(dropoutProbabilty), dropoutMaskShape(new Shape(dropoutMaskShape)) + {} + + Dropout(Ptr graph, + float dropoutProbabilty) + : Layer(graph), dropoutProbabilty(dropoutProbabilty), dropoutMaskShape(nullptr) + {} + + Expr apply(Expr input) const override { + if(getMode() == Mode::eval) + return input; + + if(dropoutMaskShape && dropoutProbabilty > 0.f) { + return marian::dropout(input, dropoutProbabilty, *dropoutMaskShape); + } else if(dropoutProbabilty > 0.f) { + return marian::dropout(input, dropoutProbabilty, {input->shape()[-2], input->shape()[-1]}); + } else { + return input; + } + } + + virtual void clear() override {} +}; + +struct LinearReluDropout final : public Linear { + using Linear::weight; + using Linear::bias; + + using Linear::dimOut; + using Linear::useBias; + using Linear::transposed; + using Linear::init; + + float dropoutProbabilty; + UPtr dropoutMaskShape; + + // Typical constructor that can take an initializer function + LinearReluDropout(Ptr graph, + int dimOut, + float dropoutProbabilty, + bool useBias = true, + bool transposed = false, + Ptr init = inits::glorotUniform()) + : Linear(graph, dimOut, useBias, transposed, init), + dropoutProbabilty(dropoutProbabilty), + dropoutMaskShape(nullptr) {} + + LinearReluDropout(Ptr graph, + int dimOut, + float dropoutProbabilty, + const Shape& dropoutMaskShape, + bool useBias = true, + bool transposed = false, + Ptr init = inits::glorotUniform()) + : Linear(graph, dimOut, useBias, transposed, init), + dropoutProbabilty(dropoutProbabilty), + dropoutMaskShape(new 
Shape(dropoutMaskShape)) {} + + Expr apply(Expr x) const override { + int dimIn = x->shape()[-1]; + + // if weight is already initialized nothing happens here + if(transposed) { + registerParameterLazy(weight, Shape({ dimOut, dimIn }), init); + } else { + registerParameterLazy(weight, Shape({ dimIn, dimOut }), init); + } + + if(useBias) { + registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); + } + + // @TODO: handle relu inplace for inference etc. + Expr output; + if(useBias) + output = marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed); + else + output = marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); + + if(getMode() == Mode::eval) + return relu(output); + + if(dropoutMaskShape && dropoutProbabilty > 0.f) { + return marian::dropoutReluInplace(output, dropoutProbabilty, *dropoutMaskShape); + } else if(dropoutProbabilty > 0.f) { + return marian::dropoutReluInplace(output, dropoutProbabilty, {output->shape()[-2], output->shape()[-1]}); + } else { + return relu(output); + } + } + + virtual void clear() override {} +}; + + +struct Norm : public Layer, public IUnaryLayer { + Norm(Ptr graph) : Layer(graph) {} + virtual ~Norm() = default; + + Expr apply(Expr x) const override = 0; +}; + +struct LayerNorm final : public Norm { + Expr weight; + Expr bias; + + float eps{1e-5f}; + bool elementwiseAffine{true}; + + LayerNorm(Ptr graph, + float eps = 1e-5f, + bool elementwiseAffine = true) + : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine) + {} + + Expr apply(Expr x) const override { + int dimModel = x->shape()[-1]; + if(elementwiseAffine) { + registerParameterLazy(weight, Shape({ dimModel }), inits::ones()); + registerParameterLazy(bias, Shape({ dimModel }), inits::zeros()); + return marian::layerNorm(x, weight, bias, eps); + } else { + return marian::layerNorm(x, nullptr, nullptr, eps); + } + } + + virtual void clear() override {} +}; + +struct RMSNorm final : public Norm { + Expr weight; + + float eps{1e-5f}; + bool elementwiseAffine{true}; + + RMSNorm(Ptr graph, + float eps = 1e-5f, + bool elementwiseAffine = true) + : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine) + {} + + Expr apply(Expr x) const override { + int dimModel = x->shape()[-1]; + if(elementwiseAffine) { + registerParameterLazy(weight, Shape({ dimModel }), inits::ones()); + return marian::rmsNorm(x, weight, nullptr, eps); + } else { + return marian::rmsNorm(x, nullptr, nullptr, eps); + } + } +}; + +} // namespace nn +} // namespace marian diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h new file mode 100644 index 000000000..da3ac4f94 --- /dev/null +++ b/src/layers_new/rnn.h @@ -0,0 +1,126 @@ +#pragma once + +#include "layers_new/interface.h" +#include "layers_new/neuralnet.h" + +namespace marian { +namespace nn { + +struct CellState { + Expr recurrent; +}; + +struct ICell { + virtual std::vector applyToInput(Expr input) const = 0; + virtual Expr applyToState(const std::vector& inputs, Expr mask, Ptr state) const = 0; +}; + +class SSRU final : public Layer, public ICell { +protected: + using Layer::namedLayers_; + +public: + Ptr iProj; // input projection + Ptr fProj; // forget gate projection + Ptr dropout; + + int dimState; // state dimension + + SSRU(Ptr graph, int dimState, float dropProb = 0.f) : Layer(graph), dimState(dimState) { + iProj = New(graph, dimState, /*useBias=*/false); + registerLayer(iProj); + fProj = New(graph, dimState); + registerLayer(fProj); + dropout = New(graph, dropProb, Shape({dimState})); + registerLayer(dropout); + } + + 
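+  // Sketch of the recurrence implemented by applyToInput()/applyToState() below, assuming that
+  // highway(a, b, g) computes sigmoid(g) * a + (1 - sigmoid(g)) * b elementwise:
+  //   x_t = iProj(dropout(input_t))                             // no bias
+  //   f_t = fProj(dropout(input_t))                             // forget gate, with bias
+  //   c_t = sigmoid(f_t) * c_{t-1} + (1 - sigmoid(f_t)) * x_t   // recurrent state
+  //   h_t = relu(c_t)                                           // cell output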
std::vector applyToInput(Expr input) const override { + int dimModel = input->shape()[-1]; + ABORT_IF(dimModel != dimState, "Model dimension {} has to match state dimension {}", dimModel, dimState); + + input = dropout->apply(input); + + Expr output = iProj->apply(input); + Expr forget = fProj->apply(input); + + return {output, forget}; + } + + Expr applyToState(const std::vector& inputs, Expr mask, Ptr state) const override { + auto prevRecurrent = state->recurrent; + auto input = inputs[0]; + auto forget = inputs[1]; + + auto nextRecurrent = highway(/*input1=*/prevRecurrent, /*input2=*/input, /*gate=*/forget); // rename to "gate"? + auto nextOutput = relu(nextRecurrent); + + // @TODO: not needed? nextRecurrent = mask ? mask * nextRecurrent : nextRecurrent; + state->recurrent = nextRecurrent; + + nextOutput = mask ? mask * nextOutput : nextOutput; + return nextOutput; + } +}; + +template +class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer { +protected: + using Layer::namedLayers_; + +public: + Ptr cell; + Ptr oProj; + + RNN(Ptr graph, int dimState, bool outputProjection = false) + : Layer(graph) { + cell = New(graph, dimState); + registerLayer(cell); + + if(outputProjection) { + oProj = New(graph, dimState); + registerLayer(oProj); + } + } + + virtual Expr apply(Expr input, Expr inputMask = nullptr) const override { + auto state = New(graph()->constant({1, 1, 1, cell->dimState}, inits::zeros()), /*position=*/0); + return apply(input, inputMask, state); + } + + virtual Expr apply(Expr input, Expr inputMask, Ptr state) const override { + auto cellState = New(); + cellState->recurrent = state->as()->get(); + + input = swapTimeBatch(input); // [beam, time, batch, dim] + if(inputMask) + inputMask = swapTimeBatch(inputMask); + int dimTimeAxis = -3; + + std::vector inputs = cell->applyToInput(input); + + std::vector outputs; + for(int i = 0; i < input->shape()[dimTimeAxis]; ++i) { + std::vector stepInputs(inputs.size()); + std::transform(inputs.begin(), inputs.end(), stepInputs.begin(), + [i, dimTimeAxis](Expr e) { return slice(e, dimTimeAxis, i); }); + auto stepMask = inputMask; + if(stepMask) + stepMask = slice(inputMask, dimTimeAxis, i); + + Expr output = cell->applyToState(stepInputs, stepMask, /*in/out=*/cellState); + outputs.push_back(output); + } + + state->as()->set(cellState->recurrent); + + Expr output = swapTimeBatch(concatenate(outputs, dimTimeAxis)); + if(oProj) + output = oProj->apply(output); + + return output; + } +}; + +} +} \ No newline at end of file diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h new file mode 100644 index 000000000..3302d9d85 --- /dev/null +++ b/src/layers_new/transformer.h @@ -0,0 +1,553 @@ +#pragma once + +#include "layers_new/attention.h" +#include "layers_new/decoder.h" +#include "layers_new/embeddings.h" +#include "layers_new/neuralnet.h" +#include "layers_new/rnn.h" + +#include + +namespace marian { +namespace nn { + +/** + * This groups the typical transformer pre/post-processing steps in to a class. + * Currently these are usually dropout, layer normalization and skip connections. + * A transformer block will usually apply one of them. 
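+ *
+ * The action string is interpreted character by character in apply(): 'd' applies dropout,
+ * 'a' adds the skip connection, and 'n' or 'r' apply the normalization layer created in the
+ * constructor (layer norm or RMS norm, respectively). For example, the common post-processing
+ * string "dan" means: dropout, add skip connection, then normalize.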
+ */ +struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { + Ptr dropout; + Ptr norm; + std::string actionDesc; + + TransformerPrePostProcessor(Ptr graph, + const std::string& actionDesc, + float dropoutProbablity) + : Layer(graph), + actionDesc(actionDesc) + { + for(char a : actionDesc) { + if(a == 'd') { + ABORT_IF(dropout, "Dropout layer already initialized? Did you specify 'd' more than once?"); + dropout = New(graph, dropoutProbablity); + registerLayer(dropout); + } else if(a == 'n') { + ABORT_IF(norm, "Norm layer already initialized? Did you specify 'n' or 'r' more than once?"); + norm = New(graph); + registerLayer(norm); + } else if(a == 'r') { + ABORT_IF(norm, "Norm layer already initialized? Did you specify 'n' or 'r' more than once?"); + norm = New(graph); + registerLayer(norm); + } + } + } + + Expr apply(Expr input, Expr previous = nullptr) const override { + Expr output = input; + for(char action : actionDesc) { + if(action == 'd') + output = dropout->apply(output); + else if(action == 'a' && previous) + output = output + previous; + else if(action == 'a' && !previous) + ABORT("Action 'a' (add skip connection) specified but no previous input given"); + else if(action == 'n' || action == 'r') + output = norm->apply(output); + else + ABORT("Action '{}' in '{}' unknown", action, actionDesc); + } + return output; + } +}; + +/** + * This is a typical transformer self-attention block. The default configuration will + * use a multi-head multiplicative self-attention layer, followed by dropout, the skip + * connection and layer normalization (dan) in the post-processor. The pre-processor does + * nothing in the default configuration. + */ +class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBinaryLayer { +public: + Ptr preprocessor; + Ptr selfAttention; + Ptr postprocessor; + + TransformerSelfAttentionBlock(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options) + { + preprocessor = New( + graph, + opt("transformer-preprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(preprocessor); + + // @TODO: factory to support different attention flavors? + selfAttention = attentionFromOptions(graph, options); + registerLayer(selfAttention); + + postprocessor = New( + graph, + opt("transformer-postprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + } + + Expr apply(Expr input, Expr mask = nullptr) const override { + auto output = preprocessor->apply(input); // optional preprocessing + output = selfAttention->apply(output, output, output, mask); // self attention, @TODO: make this a IBinaryLayer rather than IQuaternaryLayer + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + return output; + } +}; + +/** + * This is a typical transformer filter (1-dimensional convolution) block. The default configuration will + * use scale up to a larger dimension, apply a ReLU activation and scale down again, followed by dropout, + * the skip connection and layer normalization (dan) in the post-processor. The pre-processor does + * nothing in the default configuration. 
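+ *
+ * The sizes come from the options read in the constructor: the filter scales dim-emb up to
+ * transformer-dim-ffn, applies transformer-ffn-activation (default "relu"), and projects back
+ * down to dim-emb; transformer-ffn-depth controls how many linear layers are stacked, and the
+ * decoder-specific variants of these options override them when isDecoder is set.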
+ */ +struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLayer { + Ptr preprocessor; + Ptr layers; + Ptr postprocessor; + bool isDecoder{false}; + + TransformerFilterBlock(Ptr graph, + Ptr options, + bool isDecoder = false) + : LayerWithOptions(graph, options), isDecoder(isDecoder) + { + preprocessor = New( + graph, + opt("transformer-preprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(preprocessor); + + int modelDim = opt("dim-emb"); + int ffnDim = opt("transformer-dim-ffn"); + if(isDecoder && opt("transformer-decoder-dim-ffn") != 0) + ffnDim = opt("transformer-decoder-dim-ffn"); + + int depth = opt("transformer-ffn-depth", 2); + if(isDecoder && opt("transformer-decoder-ffn-depth") != 0) + depth = opt("transformer-decoder-ffn-depth"); + + auto actName = opt("transformer-ffn-activation", "relu"); + float ffnDropoutProbability = opt("transformer-dropout-ffn", 0.f); + + ABORT_IF(depth < 1, "Filter depth {} is smaller than 1", depth); + + // assemble filter of given depth + layers = New(graph); + registerLayer(layers); + + if(actName == "relu") { + layers->append(New(graph, ffnDim, ffnDropoutProbability)); + } else { + layers->append(New(graph, ffnDim)); + layers->append(activationLayerByName(graph, actName)); + layers->append(New(graph, ffnDropoutProbability)); + } + for(int i = 1; i < depth-1; ++i) { + if(actName == "relu") { + layers->append(New(graph, ffnDim, ffnDropoutProbability)); + } else { + layers->append(New(graph, ffnDim)); + layers->append(activationLayerByName(graph, actName)); + layers->append(New(graph, ffnDropoutProbability)); + } + } + layers->append(New(graph, modelDim)); + + postprocessor = New( + graph, + opt("transformer-postprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + } + + Expr apply(Expr input) const override { + Expr output = preprocessor->apply(input); // optional preprocessing + output = layers->apply(output); // main FFN + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + return output; + } +}; + +/** + * A full transformer encoder layer consists of a self-attention block followed by + * a filter block. Skip connections etc. are handled inside the blocks, see above. + */ +struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLayer { + Ptr selfAttentionBlock; + Ptr filterBlock; + + TransformerEncoderLayer(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options) + { + selfAttentionBlock = New(graph, options); + registerLayer(selfAttentionBlock); + + filterBlock = New(graph, options); + registerLayer(filterBlock); + } + + Expr apply(Expr input, Expr mask = nullptr) const override { + Expr output = selfAttentionBlock->apply(input, mask); + output = filterBlock->apply(output); + + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) + + return output; + } +}; + +/** + * A full transformer encoder stack. Before applying multiple transformer layers (depth of the encoder), we + * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity + * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. + * @TODO: get rid of these transposes. 
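+ *
+ * Usage sketch (illustrative; `options` is assumed to hold the usual transformer settings such as
+ * enc-depth, dim-emb and transformer-heads, and `input`/`mask` to come from an embedding layer in
+ * [beam depth=1, max length, batch size, vector dim] layout):
+ *
+ *   auto encoder = New<TransformerEncoder>(graph, options);
+ *   Expr context = encoder->apply(input, mask);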
+ */ +struct TransformerEncoder final : public LayerWithOptions, public IBinaryLayer { + Ptr positionEmbedding; + Ptr preprocessor; + Ptr layers; + Ptr postprocessor; + + TransformerEncoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options) + { + positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); + registerLayer(positionEmbedding); + + preprocessor = New( + graph, + opt("transformer-postprocess-emb", ""), + opt("transformer-dropout", 0.f)); + registerLayer(preprocessor); + + layers = New(graph); + registerLayer(layers); + for(int i = 0; i < opt("enc-depth"); ++i) { + auto transformerEncoderLayer = New(graph, options); + // example of changing linear layer init functions burried deep in the model + if(opt("transformer-depth-scaling", false)) + for(auto linear : transformerEncoderLayer->allLayers()) + linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + + layers->append(transformerEncoderLayer); + } + + postprocessor = New( + graph, + opt("transformer-postprocess-top", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + } + + Expr apply(Expr input, Expr mask = nullptr) const override { + // first and last operations (see at the bottom of this function) switch the time and batch + // dimensions. This order is more natural for the transformer, but more difficult to handle + // during beam search or when using RNNs. Hence the input/output transpositions here. + + // @TODO: still worth to review this whole transpose business across the tool. In the + // decoder state, Frank added information about batchMajor/timeMajor orientation. If we + // do that everywhere we can detect inconsistencies automatically. + // reorganize batch and timestep + auto output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] + if(mask) { + mask = swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + mask = transposedLogMask(mask, opt("transformer-heads")); + } + + // apply positional embeddings to contextual input + output = positionEmbedding->apply(output); + + // handle for skip connection at top + auto prevOutput = output; + + // apply dropout or layer-norm to embeddings if required + output = preprocessor->apply(output); + + // traverse the layers, use the same mask for each + for(auto layer : *layers) + output = layer->apply(output, mask); + + // apply final postprocessor if required, e.g. final layer-norm for pre-norm or final skip connection + output = postprocessor->apply(output, prevOutput); + + // restore organization of batch and time steps. This is currently required + // to make RNN-based decoders and beam search work with this. We are looking + // into making this more natural. + + // @TODO: it might be worth to make this optional when the input goes into a + // transformer decoder which now has to undo that again -- or even better + // detect idempotent transposes during a process similar to auto-batching. + // Or as other toolkits do it, make the transformer order the default and only transpose for RNNs. + output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim] + return output; + } +}; + +/** + * This is a typical transformer cross-attention block. The default configuration will + * use a multi-head multiplicative cross-attention layer, followed by dropout, the skip + * connection and layer normalization (dan) in the post-processor. The pre-processor does + * nothing in the default configuration. 
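When --transformer-depth-scaling is enabled, the encoder constructor above rescales the initialization of each layer's linear transforms by 1/sqrt(layer index + 1), so deeper layers start with smaller weights. A minimal sketch of that schedule (illustrative only):

#include <cmath>
#include <cstdio>

// Init scale for the (0-based) i-th layer under depth scaling.
float depthScale(int layerIndex) {
  return 1.f / std::sqrt(static_cast<float>(layerIndex) + 1.f);
}

int main() {
  for(int i = 0; i < 6; ++i)
    std::printf("layer %d: %.3f\n", i, depthScale(i)); // 1.000 0.707 0.577 0.500 0.447 0.408
  return 0;
}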
+ */ +class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITernaryLayer { +public: + Ptr preprocessor; + Ptr crossAttention; + Ptr postprocessor; + + TransformerCrossAttentionBlock(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options) + { + preprocessor = New( + graph, + opt("transformer-preprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(preprocessor); + + // @TODO: factory to support different attention flavors? + crossAttention = attentionFromOptions(graph, options); + registerLayer(crossAttention); + + postprocessor = New( + graph, + opt("transformer-postprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + } + + Expr apply(Expr input, Expr context, Expr contextMask = nullptr) const override { + auto output = preprocessor->apply(input); // optional preprocessing + output = crossAttention->apply(output, context, context, contextMask); // cross attention, @TODO: make this a ITernaryLayer rather than IQuaternaryLayer + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + return output; + } +}; + +#if 1 + +class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { +public: + TransformerAutoRegressiveBlock(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options) {} + + virtual ~TransformerAutoRegressiveBlock() = default; + + using IBinaryDecoderLayer::apply; +}; + +/** + * This is a transformer RNN block. + */ +class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { +public: + Ptr preprocessor; + Ptr> rnn; + Ptr postprocessor; + + TransformerRNNBlock(Ptr graph, + Ptr options) + : TransformerAutoRegressiveBlock(graph, options) + { + preprocessor = New( + graph, + opt("transformer-preprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(preprocessor); + + // @TODO: factory to support different attention flavors? + rnn = New>(graph, opt("dim-emb"), opt("transformer-rnn-projection", false)); + registerLayer(rnn); + + postprocessor = New( + graph, + opt("transformer-postprocess", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + } + + Expr apply(Expr input, Expr inputMask, Ptr state) const override { + auto output = preprocessor->apply(input); // optional preprocessing + output = rnn->apply(output, inputMask, state); // rnn application with state extension + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + return output; + } +}; + +/** + * A full transformer decoder layer consists of a self-attention block followed by + * cross-attention block and a filter block. Skip connections etc. are handled inside + * the blocks, see above. + * + * For the self-attention block we need a special mask, usually a triangle mask that + * prohibits to look into the future. + * @TODO: should the triangle mask be constructed locally here? Would make sense, but expensive + * for many layers. 
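The triangle mask mentioned above simply forbids each position from attending to later ones. A standalone sketch of such an additive log-mask; the real code would combine it with the batch mask and build a Marian expression rather than a plain matrix (illustrative only):

#include <vector>

// Additive causal mask: 0 where position j <= i may be attended to, a very
// negative value where j > i would look into the future.
std::vector<std::vector<float>> triangleLogMask(int length, float negInf = -1e9f) {
  std::vector<std::vector<float>> mask(length, std::vector<float>(length, 0.f));
  for(int i = 0; i < length; ++i)
    for(int j = i + 1; j < length; ++j)
      mask[i][j] = negInf; // added to the attention logits before the softmax
  return mask;
}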
+ */ +struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaternaryDecoderLayer { + Ptr autoRegressiveBlock; + Ptr crossAttentionBlock; + Ptr filterBlock; + + TransformerDecoderLayer(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options) + { + auto autoRegressionType = opt("transformer-decoder-autoreg", "self-attention"); + if(autoRegressionType == "self-attention") { + ABORT("Auto-regression block type {} not yet implemented", autoRegressionType); + } else if(autoRegressionType == "rnn") { + autoRegressiveBlock = New(graph, options); + } else { + ABORT("Unknown auto-regression block type {}", autoRegressionType); + } + registerLayer(autoRegressiveBlock); + + crossAttentionBlock = New(graph, options); + registerLayer(crossAttentionBlock); + + filterBlock = New(graph, options, /*isDecoder=*/true); + registerLayer(filterBlock); + } + + Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + Expr output = autoRegressiveBlock->apply(input, inputMask, state); + output = crossAttentionBlock->apply(output, context, contextMask); + output = filterBlock->apply(output); + + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) + return output; + } +}; + +/** + * A full transformer decoder stack. Before applying multiple transformer layers (depth of the decoder), we + * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity + * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. + * @TODO: get rid of these transposes. + */ +struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer { + Ptr positionEmbedding; + Ptr preprocessor; + Ptr layers; + Ptr postprocessor; + + TransformerDecoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options) + { + positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); + registerLayer(positionEmbedding); + + preprocessor = New( + graph, + opt("transformer-postprocess-emb", ""), + opt("transformer-dropout", 0.f)); + registerLayer(preprocessor); + + size_t decDepth = opt("dec-depth"); + std::vector tiedLayers = opt>("transformer-tied-layers", std::vector()); + ABORT_IF(!tiedLayers.empty() && tiedLayers.size() != decDepth, + "Specified layer tying for {} layers, but decoder has {} layers", + tiedLayers.size(), + decDepth); + // shift to base-0 indexing + for(auto& layerNo : tiedLayers) + layerNo = layerNo - 1; + + layers = New(graph); + registerLayer(layers); + for(size_t i = 0; i < decDepth; ++i) { + if(tiedLayers.empty() || tiedLayers[i] == i) { // not tied or tied to itself, so needs to be created first + auto transformerDecoderLayer = New(graph, options); + layers->append(transformerDecoderLayer); + } else { + ABORT_IF(tiedLayers[i] > i, "Cannot tie to layer above this layer??"); + layers->append(layers->at(tiedLayers[i])); // repeat layer to tie weights + } + + auto currentLayer = layers->at(i)->as(); + // example of changing linear layer init functions burried deep in the model + if(opt("transformer-depth-scaling", false)) { + auto autoRegLayer = currentLayer->autoRegressiveBlock->as(); + autoRegLayer->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + + for(auto linear : currentLayer->crossAttentionBlock->allLayers()) + linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + 
for(auto linear : currentLayer->filterBlock->allLayers()) + linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + + } + } + + postprocessor = New( + graph, + opt("transformer-postprocess-top", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + } + + Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + // first and last operations (see at the bottom of this function) switch the time and batch + // dimensions. This order is more natural for the transformer, but more difficult to handle + // during beam search or when using RNNs. Hence the input/output transpositions here. + Expr output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] + context = swapTimeBatch(context); + + // @TODO: write function prepareMasks(); + // @TODO: create triangle mask here and combine with inputMask + LOG_ONCE(info, "Don't forget the triangle mask if required!"); + if(inputMask) { + inputMask = swapTimeBatch(inputMask); // [beam depth=1, batch size, max length, vector dim=1] + } + + if(contextMask) { + contextMask = swapTimeBatch(contextMask); // [beam depth=1, max length, batch size, vector dim=1] + contextMask = transposedLogMask(contextMask, opt("transformer-heads")); // [beam broadcast=1, batch size * num heads, max length broadcast=1, max length] + } + + // apply positional embeddings to contextual input @TODO: remove need for conversion to int + output = positionEmbedding->apply(output, (int)state->getPosition()); + + // handle for skip connection at top + auto prevOutput = output; + + // apply dropout or layer-norm to embeddings if required + output = preprocessor->apply(output); + + // get an iterator to per-layer states + auto layerStateIt = state->as()->begin(); + // traverse the layers, use the same mask for each + for(auto layer : *layers) + output = layer->as()->apply(output, inputMask, context, contextMask, /*in/out=*/*layerStateIt++); + + // apply final postprocessor if requred, e.g. final layer-norm for pre-norm or final skip connection + output = postprocessor->apply(output, prevOutput); + + // restore organization of batch and time steps. This is currently required + // to make RNN-based decoders and beam search work with this. We are looking + // into making this more natural. + // @TODO: it might be worth to make this optional when the input goes into a + // transformer decoder which now has to undo that again -- or even better + // detect idempotent transposes during a process similar to auto-batching. + // Or as other toolkits do it, make the transformer order the default and only transpose for RNNs. 
+ output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim] + return output; + } +}; +#endif + +} // namespace nn +} // namespace marian diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 5a317019d..17ee2a4d9 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -12,6 +12,7 @@ #include "models/s2s.h" #include "models/laser.h" #include "models/transformer_factory.h" +#include "models/transformer_new.h" #ifdef CUDNN #include "models/char_s2s.h" @@ -183,20 +184,43 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti .construct(graph); } - else if(type == "transformer") { -#if 1 + else if(type == "transformer-new") { auto newOptions = options->with("usage", use); auto res = New(graph, newOptions); - res->push_back(New(graph, newOptions->with("type", "transformer"))); - res->push_back(New(graph, newOptions->with("type", "transformer"))); + + auto enc = New(graph, newOptions->with("type", "transformer")); + enc->setName("TransformerBatchEncoder"); + res->push_back(enc); + + auto dec = New(graph, newOptions->with("type", "transformer")); + dec->setName("TransformerBatchDecoder"); + res->push_back(dec); + return res; -#else - return models::encoder_decoder(options->with( - "usage", use)) - .push_back(models::encoder()("type", "transformer")) - .push_back(models::decoder()("type", "transformer")) - .construct(graph); -#endif + } + + else if(type == "transformer") { + const char* tflavor = std::getenv("TRANSFORMER_FLAVOR"); + if(tflavor && std::strcmp(tflavor, "experimental") == 0) { + auto newOptions = options->with("usage", use); + auto res = New(graph, newOptions); + + auto enc = New(graph, newOptions->with("type", "transformer")); + enc->setName("TransformerBatchEncoder"); + res->push_back(enc); + + auto dec = New(graph, newOptions->with("type", "transformer")); + dec->setName("TransformerBatchDecoder"); + res->push_back(dec); + + return res; + } else { + auto newOptions = options->with("usage", use); + auto res = New(graph, newOptions); + res->push_back(New(graph, newOptions->with("type", "transformer"))); + res->push_back(New(graph, newOptions->with("type", "transformer"))); + return res; + } } else if(type == "transformer_s2s") { diff --git a/src/models/s2s.h b/src/models/s2s.h index 8eb2ef8d1..cfab3fcae 100644 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -246,7 +246,7 @@ class DecoderS2S : public DecoderBase { } rnn::States startStates(opt("dec-depth"), {start, start}); - return New(startStates, Logits(), encStates, batch); + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } virtual Ptr step(Ptr graph, @@ -341,8 +341,7 @@ class DecoderS2S : public DecoderBase { logits = output_->applyAsLogits({embeddings, decoderContext}); // return unormalized(!) 
probabilities - auto nextState = New( - decoderStates, logits, state->getEncoderStates(), state->getBatch()); + auto nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), /*isBatchMajor=*/false); // Advance current target token position by one nextState->setPosition(state->getPosition() + 1); @@ -351,8 +350,7 @@ class DecoderS2S : public DecoderBase { // helper function for guided alignment virtual const std::vector getAlignments(int i = 0) override { - auto att - = rnn_->at(0)->as()->at(i + 1)->as(); + auto att = rnn_->at(0)->as()->at(i + 1)->as(); return att->getAlignments(); } diff --git a/src/models/states.h b/src/models/states.h index 20dd59c95..a4be3795e 100644 --- a/src/models/states.h +++ b/src/models/states.h @@ -21,19 +21,16 @@ class EncoderState { virtual Expr getContext() const { return context_; } virtual Expr getAttended() const { return context_; } - virtual Expr getMask() const { - return mask_; - } // source batch mask; may have additional positions suppressed - + virtual Expr getMask() const { return mask_; } + + // source batch mask; may have additional positions suppressed virtual const Words& getSourceWords() { return batch_->front()->data(); } // Sub-select active batch entries from encoder context and context mask - Ptr select( - const std::vector& batchIndices) { // [batchIndex] indices of active batch entries + Ptr select(const std::vector& batchIndices) { // [batchIndex] indices of active batch entries // Dimension -2 is OK for both, RNN and Transformer models as the encoder context in Transformer // gets transposed to the same dimension layout - return New( - index_select(context_, -2, batchIndices), index_select(mask_, -2, batchIndices), batch_); + return New(index_select(context_, -2, batchIndices), index_select(mask_, -2, batchIndices), batch_); } }; @@ -43,6 +40,7 @@ class DecoderState { Logits logProbs_; std::vector> encStates_; Ptr batch_; + bool isBatchMajor_{false}; Expr targetHistoryEmbeddings_; // decoder history (teacher-forced or from decoding), embedded Expr targetMask_; @@ -55,8 +53,9 @@ class DecoderState { DecoderState(const rnn::States& states, Logits logProbs, const std::vector>& encStates, - Ptr batch) - : states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch) {} + Ptr batch, + bool isBatchMajor = false) + : states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch), isBatchMajor_(isBatchMajor) {} virtual ~DecoderState() {} // @TODO: Do we need all these to be virtual? @@ -64,9 +63,9 @@ class DecoderState { virtual Logits getLogProbs() const { return logProbs_; } virtual void setLogProbs(Logits logProbs) { logProbs_ = logProbs; } + virtual bool isBatchMajor() { return isBatchMajor_; } - // @TODO: should this be a constructor? Then derived classes can call this without the New<> in - // the loop + // @TODO: should this be a constructor? Then derived classes can call this without the New<> in the loop virtual Ptr select( const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] const std::vector& batchIndices, // [batchIndex] @@ -75,15 +74,14 @@ class DecoderState { for(auto& es : encStates_) // If the size of the batch dimension of the encoder state context changed, subselect the // correct batch entries - newEncStates.push_back( - es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices)); + newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? 
es : es->select(batchIndices)); // hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices - auto selectedState - = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/false), - logProbs_, - newEncStates, - batch_); + auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/isBatchMajor_), + logProbs_, + newEncStates, + batch_, + isBatchMajor_); // Set positon of new state based on the target token position of current state selectedState->setPosition(getPosition()); diff --git a/src/models/transformer.h b/src/models/transformer.h index 1fed868b6..a3f6d9b53 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -285,6 +285,7 @@ class Transformer : public EncoderOrDecoderBase { auto Wq = graph_->param(prefix + "_Wq", {dimModel, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f)); auto bq = graph_->param(prefix + "_bq", { 1, dimModel}, inits::zeros()); auto qh = affine(q, Wq, bq); + qh = SplitHeads(qh, dimHeads); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] Expr kh; @@ -633,35 +634,6 @@ class EncoderTransformer : public Transformer { virtual void clear() override {} }; -class TransformerState : public DecoderState { -public: - TransformerState(const rnn::States& states, - Logits logProbs, - const std::vector>& encStates, - Ptr batch) - : DecoderState(states, logProbs, encStates, batch) {} - - virtual Ptr select(const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] - const std::vector& batchIndices, // [batchIndex] - int beamSize) const override { - - // @TODO: code duplication with DecoderState only because of isBatchMajor=true, should rather be a contructor argument of DecoderState? - - std::vector> newEncStates; - for(auto& es : encStates_) - // If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries - newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices)); - - // Create hypothesis-selected state based on current state and hyp indices - auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/true), logProbs_, newEncStates, batch_); - - // Set the same target token position as the current state - // @TODO: This is the same as in base function. 
- selectedState->setPosition(getPosition()); - return selectedState; - } -}; - class DecoderTransformer : public Transformer { typedef Transformer Base; using Base::Base; @@ -718,12 +690,11 @@ class DecoderTransformer : public Transformer { start->set_name("decoder_start_state_" + std::to_string(batchIndex_)); rnn::States startStates(opt("dec-depth"), {start, start}); - // don't use TransformerState for RNN layers - return New(startStates, Logits(), encStates, batch); + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } else { rnn::States startStates; - return New(startStates, Logits(), encStates, batch); + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); } } @@ -825,7 +796,7 @@ class DecoderTransformer : public Transformer { rnn::State prevDecoderState; if(prevDecoderStates.size() > 0) prevDecoderState = prevDecoderStates[i]; - + // self-attention std::string layerType = opt("transformer-decoder-autoreg", "self-attention"); rnn::State decoderState; @@ -903,7 +874,6 @@ class DecoderTransformer : public Transformer { auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] //************************************************************************// - // final feed-forward layer (output) if(shortlist_) output_->setShortlist(shortlist_); @@ -912,11 +882,9 @@ class DecoderTransformer : public Transformer { // return unormalized(!) probabilities Ptr nextState; if (opt("transformer-decoder-autoreg", "self-attention") == "rnn") { - nextState = New( - decoderStates, logits, state->getEncoderStates(), state->getBatch()); + nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); } else { - nextState = New( - decoderStates, logits, state->getEncoderStates(), state->getBatch()); + nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); } nextState->setPosition(state->getPosition() + 1); return nextState; diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index b282d819c..46df741b0 100644 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -3,10 +3,172 @@ #include "marian.h" +#include "layers_new/neuralnet.h" #include "models/decoder.h" #include "models/encoder.h" +#include "models/encoder_decoder.h" namespace marian { Ptr NewEncoderTransformer(Ptr graph, Ptr options); Ptr NewDecoderTransformer(Ptr graph, Ptr options); + +class TransformerLegacy : public EncoderDecoder { +public: + TransformerLegacy(Ptr graph, Ptr options) + : EncoderDecoder(graph, options), nameMap_(createNameMap()) { } + + void load(Ptr graph, + const std::vector& items, + bool markedReloaded = true) override { + + for(auto it = items.begin(); it != items.end(); it++) { + auto pair = nameMap_.find(it->name); + if(pair != nameMap_.end()) { + LOG(debug, "Mapping parameter {} to {}", it->name, pair->second); + const_cast(*it).name = pair->second; + + // reduce shape of bias vectors from {1, dimModel} to {dimModel} + int dimModel = it->shape[-1]; + if(it->shape == Shape({1, dimModel})) + const_cast(*it).shape = Shape({dimModel}); + } else { + LOG(debug, "Could not find parameter {}", it->name); + } + } + + // in the new model, linear layers are transposed; we undo that here. 
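To make the remapping step in load() above concrete: each legacy parameter name is looked up in the name map, renamed to its path in the new layer hierarchy, and bias shapes are flattened from {1, dimModel} to {dimModel}. A standalone sketch of that logic over a toy item struct (illustrative only, no Marian types):

#include <map>
#include <string>
#include <vector>

struct ToyItem {              // stand-in for an io::Item: a parameter name plus its shape
  std::string name;
  std::vector<int> shape;
};

// Rename legacy parameters via the lookup table and flatten {1, d} bias shapes to {d}.
void remapItems(std::vector<ToyItem>& items, const std::map<std::string, std::string>& nameMap) {
  for(auto& item : items) {
    auto hit = nameMap.find(item.name);
    if(hit == nameMap.end())
      continue;                                      // unknown parameters are left untouched
    item.name = hit->second;
    if(item.shape.size() == 2 && item.shape[0] == 1) // bias stored as {1, dimModel}
      item.shape = {item.shape[1]};                  // becomes {dimModel}
  }
}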
+ // @TODO: alternatively, we can transpose the item data + auto encoder = std::dynamic_pointer_cast(encoders_[0]); + ABORT_IF(!encoder, "Could not cast to new type of encoder??"); + for(auto& linear : encoder->allLayers()) + linear->transposed = false; + + auto decoder = std::dynamic_pointer_cast(decoders_[0]); + ABORT_IF(!decoder, "Could not cast to new type of decoder??"); + for(auto& linear : decoder->allLayers()) + linear->transposed = false; + + // load items into the graph + graph->load(items); + } + + void load(Ptr graph, + const std::string& name, + bool markReloaded = true) override { + LOG(info, "Loading model from {}", name); + auto items = io::loadItems(name); + load(graph, items, markReloaded); + } + +private: + std::map nameMap_; + + std::map createNameMap() { + std::map nameMap = { + {"Wemb", "Wemb"}, + }; + + // @TODO: This is going to change + std::string prefix = "TransformerBatchEncoder"; + + std::string key, value; + for(int layerNo = 0; layerNo < opt("enc-depth"); ++layerNo) { + // name maps for encoder self-attention blocks + nameMap[fmt::format("encoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("encoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for encoder FFN blocks + int mult = 3; + for(int ffnLayerNo = 0; ffnLayerNo < opt("transformer-ffn-depth"); ++ffnLayerNo) { + std::string layerType = "Linear"; + // multiplying with 3 since in new model activation and dropout are also layers that are always added + if(opt("transformer-ffn-activation") == "relu" && ffnLayerNo < opt("transformer-ffn-depth") - 1) { + mult = 1; + layerType = "LinearReluDropout"; + } + nameMap[fmt::format("encoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); + nameMap[fmt::format("encoder_l{}_ffn_b{}", 
layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); + } + nameMap[fmt::format("encoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("encoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->encoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); + } + + prefix = "TransformerBatchDecoder"; + for(int layerNo = 0; layerNo < opt("dec-depth"); ++layerNo) { + // name maps for decoder self-attention blocks + nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for decoder SSRU + nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, 
layerNo); + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for decoder cross-attention blocks + nameMap[fmt::format("decoder_l{}_context_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->bias", prefix, layerNo); + + nameMap[fmt::format("decoder_l{}_context_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + + // name maps for decoder FFN blocks + int mult = 3; + for(int ffnLayerNo = 0; ffnLayerNo < opt("transformer-ffn-depth"); ++ffnLayerNo) { + std::string layerType = "Linear"; + // multiplying with 3 since in new model activation and dropout are also layers that are always added + if(opt("transformer-ffn-activation") == "relu" && ffnLayerNo < opt("transformer-ffn-depth") - 1) { + mult = 1; + layerType = "LinearReluDropout"; + } + nameMap[fmt::format("decoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); + nameMap[fmt::format("decoder_l{}_ffn_b{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); + } + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); + } + + return nameMap; + } +}; + } // namespace marian diff --git a/src/models/transformer_new.h b/src/models/transformer_new.h new file mode 100644 index 000000000..cfc3a6b14 --- /dev/null +++ b/src/models/transformer_new.h @@ -0,0 +1,245 @@ 
+#pragma once + +#include "layers_new/transformer.h" + +#include "models/encoder.h" +#include "models/decoder.h" +#include "models/states.h" +#include "layers/constructors.h" + +namespace marian { + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +struct TransformerBatchEncoder : public nn::LayerWithOptions, + public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings + public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? + Ptr encoder; + + TransformerBatchEncoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + EncoderBase(graph, options) + { + encoder = New(graph, options); + registerLayer(encoder); + } + + // @TODO: subBatch should be of type Expr + virtual std::tuple apply(Ptr subBatch) const override { + // @TODO: this is still using the bad old interface + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + const auto& [batchEmbedding, batchMask] = embeddingLayer->apply(subBatch); + auto batchContext = encoder->apply(batchEmbedding, batchMask); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return std::make_tuple(batchContext, batchMask); + } + + virtual Expr apply(const Words& words, const Shape& shape) const override final { + return applyIndices(toWordIndexVector(words), shape); + } + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& wordIndices, const Shape& shape) const override final { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + Expr batchEmbedding = embeddingLayer->applyIndices(wordIndices, shape); + auto batchContext = encoder->apply(batchEmbedding, /*mask=*/nullptr); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return batchContext; + } + + // @TODO: currently here for backwards compat, should be replaced with apply() + virtual Ptr build(Ptr graph, + Ptr batch) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + EncoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. + // A good check to have for catching weird situations early. 
+ ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); + return New(batchEmbedding, batchMask, batch); + } + + virtual void clear() override { + Layer::clear(); + } +}; + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +class TransformerBatchDecoder : public nn::LayerWithOptions, + public DecoderBase { + + Ptr decoder; + Ptr output_; + + void lazyCreateOutputLayer() + { + using db = DecoderBase; + + if(output_) // create it lazily + return; + + int dimTrgVoc = db::opt>("dim-vocabs")[batchIndex_]; + + auto outputFactory = mlp::OutputFactory( + "prefix", prefix_ + "_ff_logit_out", + "dim", dimTrgVoc, + "vocab", db::opt>("vocabs")[batchIndex_], // for factored outputs + "output-omit-bias", db::opt("output-omit-bias", false), + "output-approx-knn", db::opt>("output-approx-knn", {}), + "lemma-dim-emb", db::opt("lemma-dim-emb", 0), + "lemma-dependency", db::opt("lemma-dependency", ""), // for factored outputs + "factors-combine", db::opt("factors-combine", "")); // for factored outputs + + if(db::opt("tied-embeddings") || db::opt("tied-embeddings-all")) + outputFactory.tieTransposed(db::opt("tied-embeddings-all") || db::opt("tied-embeddings-src") ? "Wemb" : prefix_ + "_Wemb"); + + output_ = std::dynamic_pointer_cast(outputFactory.construct(graph())); // (construct() returns only the underlying interface) + } + +public: + TransformerBatchDecoder(Ptr graph, Ptr options) + : LayerWithOptions(graph, options), DecoderBase(graph, options) { + + decoder = New(graph, options); + registerLayer(decoder); + + } + + virtual Ptr startState(Ptr graph, + Ptr batch, + std::vector>& encStates) override { + +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + DecoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. + // A good check to have for catching weird situations early. 
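One detail worth noting in lazyCreateOutputLayer above: with tied embeddings the output projection reuses an embedding matrix rather than owning its own weights. A tiny sketch of that name choice (illustrative only):

#include <string>

// With tied-embeddings-all or tied-embeddings-src the output layer ties to the joint
// "Wemb" matrix, otherwise to the decoder-specific embedding matrix.
std::string tiedEmbeddingName(bool tiedAll, bool tiedSrc, const std::string& prefix) {
  return (tiedAll || tiedSrc) ? "Wemb" : prefix + "_Wemb";
}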
+ ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + std::string layerType = DecoderBase::opt("transformer-decoder-autoreg", "self-attention"); + if (layerType == "rnn") { + int dimBatch = (int)batch->size(); + int dim = DecoderBase::opt("dim-emb"); + + auto start = graph->constant({1, 1, dimBatch, dim}, inits::zeros()); + rnn::States startStates(DecoderBase::opt("dec-depth"), {start, start}); + + // don't use TransformerState for RNN layers + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); + } + else { + rnn::States startStates; + return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); + } + } + + virtual Ptr step(Ptr graph, + Ptr state) override { +#if 1 // Sanity check for as long as we mix legacy code and new code + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + lazyCreateOutputLayer(); + return step(state); + } + + Ptr step(Ptr state) { + auto embeddings = state->getTargetHistoryEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] + auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis + + //************************************************************************// + + auto encoderContext = state->getEncoderStates()[0]->getContext(); // encoder output + auto encoderMask = state->getEncoderStates()[0]->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention + + // Convert old style decoder state to new decoder state + size_t position = state->getPosition(); + auto nnState = New(position); + for(auto& layerState : state->getStates()) + nnState->as()->append(New(layerState.cell, position)); + + auto decoderContext = decoder->apply(embeddings, decoderMask, encoderContext, encoderMask, nnState); + + // final feed-forward layer (output) + if(shortlist_) + output_->setShortlist(shortlist_); + auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] + + // Convert new style decoder state to old decoder state + // @TODO: This is such a mess! + rnn::States decoderStates; + for(auto layerState : *nnState->as()) { + auto cellState = layerState->as()->get(); + decoderStates.push_back(rnn::State({ cellState, cellState })); + } + // return unnormalized(!) probabilities + auto nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); + nextState->setPosition(state->getPosition() + 1); + + return nextState; + } + + // helper function for guided alignment + // @TODO: const vector<> seems wrong. Either make it non-const or a const& (more efficient but dangerous) + virtual const std::vector getAlignments(int /*i*/ = 0) override { + ABORT("Not implemented"); + return {}; + } + + virtual void clear() override { + Layer::clear(); + if (output_) + output_->clear(); + } +}; + +} // namespace marian + +#if 0 // ignore me. To-be-removed once fully functional. 
+ +static void testme() { + using namespace marian; + using namespace nn; + + auto options = New( + "enc-depth", 12, + "transformer-heads", 8, + "dim-emb", 512, + "transformer-ffn-depth", 2, + "transformer-dim-ffn", 2048, + "transformer-dropout", 0.1, + "transformer-dropout-attention", 0.0, + "transformer-postprocess", "dan", + "transformer-ffn-activation", "relu", + "transformer-train-position-embeddings", false, + "transformer-depth-scaling", true, + "max-length", 256); + + Config::seed = 1234; + + auto graph = New(/*inference=*/true); + graph->setDevice(CPU0); + graph->reserveWorkspaceMB(1000); + + auto input = graph->constant({10, 1, 512}, inits::glorotUniform()); // [length, batch, dim] + auto mask = graph->constant({10, 1, 1}, inits::ones()); // [length, batch, 1] + + auto encoder = New(graph, options); + encoder->setName("TransformerEncoder"); + encoder->setEvalMode(); + + auto context = encoder->apply(input, mask); + + std::cerr << encoder->layerInfo(/*includeChildren=*/true) << std::endl; + + debug(context); + + graph->forward(); + graph->save("test.npz"); +} + +#endif diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1e1adc38b..5be3eee26 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -388,12 +388,13 @@ void TransposeGeneric(Tensor out, Tensor in, const std::vector& vAxis) { } void TransposeND(Tensor out, Tensor in, const std::vector& vAxis) { - if(vAxis == std::vector({0, 2, 1, 3})) - Transpose0213(out, in); #if MKL_FOUND - else if(vAxis.size() == 4 && vAxis[3] == 3) + if(vAxis.size() == 4 && vAxis[3] == 3) TransposeFirst3In4(out, in, vAxis); + else #endif // MKL_FOUND + if(vAxis == std::vector({0, 2, 1, 3})) + Transpose0213(out, in); else if(vAxis == std::vector({1, 0}) && in->shape()[-1] % 16 == 0 && in->shape()[-2] % 16 == 0) Transpose10(out, in); diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index ccf8cc72d..0a6c047cd 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -10,6 +10,7 @@ if(NOT MSVC) prod cli pooling + # transformer_new ) foreach(test ${APP_TESTS}) diff --git a/src/tests/transformer_new.cpp b/src/tests/transformer_new.cpp new file mode 100644 index 000000000..2d1e89281 --- /dev/null +++ b/src/tests/transformer_new.cpp @@ -0,0 +1,11 @@ +#include "marian.h" +#include "models/transformer_new.h" + + +int main(int argc, char** argv) { + using namespace marian; + + testme(); + + return 0; +} From d225c24d7fa72372387fc63cbe1c118d14071fcb Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 1 Mar 2023 13:48:09 +0000 Subject: [PATCH 223/254] Merged PR 28128: Comet scoring and training with new layer framework This PR adds: * code for comet scoring and training with the new layer framework * conversion scripts from Unbabel comet to Marian model --- CHANGELOG.md | 8 +- VERSION | 2 +- scripts/bert/contrib/chpt2pt.py | 23 ++ scripts/bert/contrib/hugging2marian.py | 153 ++++++++++++ scripts/bert/contrib/roberta2marian.py | 163 ++++++++++++ scripts/comet/comet2marian.py | 216 ++++++++++++++++ src/common/aliases.cpp | 28 +++ src/common/config_parser.cpp | 11 + src/common/file_stream.cpp | 2 +- src/data/corpus_base.cpp | 8 +- src/data/corpus_base.h | 1 + src/data/sentencepiece_vocab.cpp | 3 +- src/embedder/embedder.h | 20 +- src/embedder/vector_collector.cpp | 20 +- src/embedder/vector_collector.h | 3 +- src/functional/operators.h | 12 +- src/graph/expression_operators.cpp | 2 +- src/graph/node_operators_unary.h | 36 ++- 
src/layers/embedding.cpp | 7 + src/layers_new/embeddings.h | 6 +- src/layers_new/transformer.h | 4 +- src/models/bert.h | 1 + src/models/comet_qe.h | 327 +++++++++++++++++++++++++ src/models/encoder_pooler.h | 7 +- src/models/model_factory.cpp | 60 ++++- src/tensors/gpu/add.inc | 7 +- src/tensors/gpu/add_all.inc | 2 + src/tensors/gpu/element.inc | 2 + src/training/graph_group.cpp | 9 +- src/training/validator.cpp | 113 +++++++++ src/training/validator.h | 19 ++ 31 files changed, 1236 insertions(+), 39 deletions(-) create mode 100644 scripts/bert/contrib/chpt2pt.py create mode 100644 scripts/bert/contrib/hugging2marian.py create mode 100644 scripts/bert/contrib/roberta2marian.py create mode 100644 scripts/comet/comet2marian.py create mode 100644 src/models/comet_qe.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a7316be9..6aff5037f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added - +- Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian. +- Validator that generates embeddings and can be used during COMET training with an external script. - New experimental layer framework for Transformer-like models. ### Fixed -- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp) +- Only collect batch statistics during mini-batch-fit up to actual max-length. +- Implemented fully correct version of GELU instead of using bad approximatin via Swish. +- Handle copying from fp32 or fp16 embeddings in embedder mode correctly. +- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp). ## [1.12.0] - 2023-02-20 diff --git a/VERSION b/VERSION index 41de27dfa..00f862625 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.2 +v1.12.3 diff --git a/scripts/bert/contrib/chpt2pt.py b/scripts/bert/contrib/chpt2pt.py new file mode 100644 index 000000000..3ca8fee6a --- /dev/null +++ b/scripts/bert/contrib/chpt2pt.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +""" +This script converts *.chpt files to *.pt files, potentially useful for extracting weights only from larger checkpoints. +""" + +import torch +import argparse + +# Create a parser for command line arguments +parser = argparse.ArgumentParser() + +# Add arguments for the source and target files +parser.add_argument("--source", type=str, required=True, help="Path to the source *.chpt file") +parser.add_argument("--target", type=str, required=True, help="Path to the target *.pt file") + +# Parse the command line arguments +args = parser.parse_args() + +# Load the model from the source file +model = torch.load(args.source) + +# Save the model to the target file +torch.save(model, args.target) \ No newline at end of file diff --git a/scripts/bert/contrib/hugging2marian.py b/scripts/bert/contrib/hugging2marian.py new file mode 100644 index 000000000..0ee31414a --- /dev/null +++ b/scripts/bert/contrib/hugging2marian.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +""" +This script converts Huggingface Bert model to Marian weight file. 
+""" + +import argparse +import numpy as np +import sys +import yaml + +from transformers import XLMRobertaModel + +parser = argparse.ArgumentParser(description='Convert Huggingface Bert model to Marian weight file.') +parser.add_argument('--bert', help='Path to Huggingface Bert PyTorch model', required=True) +parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +args = parser.parse_args() + +huggingface = XLMRobertaModel.from_pretrained(args.bert) +huggingface.eval() + +print(huggingface.config) + +config = dict() +config["type"] = "bert-classifier" +config["input-types"] = ["sequence"] +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False + +config["transformer-ffn-depth"] = 2 +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = False +# @TODO: figure out if it's worth adding `cometModel.name_or_path` to the end of this version string. +config["version"] = "huggingface2marian.py conversion" + +config["enc-depth"] = 0 +config["transformer-dim-ffn"] = huggingface.config.intermediate_size +config["transformer-heads"] = huggingface.config.num_attention_heads +config["transformer-ffn-activation"] = huggingface.config.hidden_act + +config["bert-sep-symbol"] = "" +config["bert-class-symbol"] = "" + +marianModel = dict() + +def transposeOrder(mat): + matT = np.transpose(mat) # just a view with changed row order + return matT.flatten(order="C").reshape(matT.shape) # force row order change and reshape + + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = np.atleast_2d(num) + else: + if transpose: + marianModel[trg] = transposeOrder(num) # transpose with row order change + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(np.atleast_2d(num)) for num in nums] + marianModel[trg] = np.stack(nums, axis=0) + + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + if name == "BertLayer": + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + convert(pd, ["attention.self.query.weight"], f"encoder_l{nth + 1}_self_Wq", transpose=True) + convert(pd, ["attention.self.key.weight"], f"encoder_l{nth + 1}_self_Wk") + convert(pd, ["attention.self.value.weight"], f"encoder_l{nth + 1}_self_Wv") + + convert(pd, ["attention.self.query.bias"], f"encoder_l{nth + 1}_self_bq", bias=True) + convert(pd, ["attention.self.key.bias"], f"encoder_l{nth + 1}_self_bk", bias=True) + convert(pd, ["attention.self.value.bias"], f"encoder_l{nth + 1}_self_bv", bias=True) + + convert(pd, ["attention.output.dense.weight"], f"encoder_l{nth + 1}_self_Wo") + convert(pd, ["attention.output.dense.bias"], f"encoder_l{nth + 1}_self_bo", bias=True) + + convert(pd, ["attention.output.LayerNorm.weight"], f"encoder_l{nth + 1}_self_Wo_ln_scale", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"encoder_l{nth + 1}_self_Wo_ln_bias", bias=True) + + convert(pd, ["intermediate.dense.weight"], f"encoder_l{nth + 1}_ffn_W1") + convert(pd, ["intermediate.dense.bias"], f"encoder_l{nth + 1}_ffn_b1", bias=True) + convert(pd, ["output.dense.weight"], f"encoder_l{nth + 1}_ffn_W2") + convert(pd, 
["output.dense.bias"], f"encoder_l{nth + 1}_ffn_b2", bias=True) + + convert(pd, ["output.LayerNorm.weight"], f"encoder_l{nth + 1}_ffn_ffn_ln_scale", bias=True) + convert(pd, ["output.LayerNorm.bias"], f"encoder_l{nth + 1}_ffn_ffn_ln_bias", bias=True) + + config["enc-depth"] += 1 + + elif name == "BertEmbeddings": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + convert(pd, ["word_embeddings.weight"], f"Wemb", transpose=False) + convert(pd, ["position_embeddings.weight"], f"Wpos", transpose=False) + + config["bert-type-vocab-size"] = 0 + if hasattr(layer, "token_type_embeddings"): + convert(pd, ["token_type_embeddings.weight"], f"Wtype", transpose=False) + config["bert-type-vocab-size"] = pd["token_type_embeddings.weight"].shape[0] + config["bert-train-type-embeddings"] = True + + convert(pd, ["LayerNorm.weight"], f"encoder_emb_ln_scale_pre", bias=True) + convert(pd, ["LayerNorm.bias"], f"encoder_emb_ln_bias_pre", bias=True) + + config["dim-emb"] = pd["word_embeddings.weight"].shape[1] + config["dim-vocabs"] = [ pd["word_embeddings.weight"].shape[0] ] + config["max-length"] = pd["position_embeddings.weight"].shape[0] + + elif name == "BertPooler": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + + pd = dict(layer.named_parameters()) + convert(pd, ["dense.weight"], "classifier_ff_logit_l1_W") + convert(pd, ["dense.bias"], "classifier_ff_logit_l1_b", bias=True) + + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(huggingface) + +for m in marianModel: + print(m, marianModel[m].shape) + +configYamlStr = yaml.dump(config, default_flow_style=False) +desc = list(configYamlStr) +npDesc = np.chararray((len(desc),)) +npDesc[:] = desc +npDesc.dtype = np.int8 +marianModel["special:model.yml"] = npDesc + +print("\nMarian config:") +print(configYamlStr) +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) \ No newline at end of file diff --git a/scripts/bert/contrib/roberta2marian.py b/scripts/bert/contrib/roberta2marian.py new file mode 100644 index 000000000..fb80733f4 --- /dev/null +++ b/scripts/bert/contrib/roberta2marian.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +This script converts Fairseq Roberta model to Marian weight file. 
+""" + +import argparse +import numpy as np +import sys +import torch +import yaml + +from fairseq.models.roberta import RobertaModel + +parser = argparse.ArgumentParser(description='Convert Fairseq Roberta model to Marian weight file.') +parser.add_argument('--roberta', help='Path to Roberta model', required=True) +parser.add_argument('--comet', help='Path to COMET model', required=True) +parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +args = parser.parse_args() + +roberta = RobertaModel.from_pretrained(args.roberta) +model = torch.load(args.comet) +print(model) + +roberta.eval() + +config = dict() +config["type"] = "bert-encoder" +config["input-types"] = ["sequence"] +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False + +config["transformer-ffn-depth"] = 2 +config["transformer-ffn-activation"] = "gelu" # figure this out dynamically +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = False +config["bert-type-vocab-size"] = 0 +# @TODO: figure out if it's worth adding `cometModel.name_or_path` to the end of this version string. +config["version"] = "roberta2marian.py conversion" + +config["enc-depth"] = 0 + +marianModel = dict() + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = np.atleast_2d(num).copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(np.atleast_2d(num)) for num in nums] + marianModel[trg] = np.stack(nums, axis=0).copy() + + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + if name == "TransformerSentenceEncoderLayer": + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + convert(pd, ["self_attn.q_proj.weight"], f"encoder_l{nth + 1}_self_Wq") + convert(pd, ["self_attn.k_proj.weight"], f"encoder_l{nth + 1}_self_Wk") + convert(pd, ["self_attn.v_proj.weight"], f"encoder_l{nth + 1}_self_Wv") + + convert(pd, ["self_attn.q_proj.bias"], f"encoder_l{nth + 1}_self_bq", bias=True) + convert(pd, ["self_attn.k_proj.bias"], f"encoder_l{nth + 1}_self_bk", bias=True) + convert(pd, ["self_attn.v_proj.bias"], f"encoder_l{nth + 1}_self_bv", bias=True) + + # convert(pd, ["self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight"], f"encoder_l{nth + 1}_self_Wt") + # convert(pd, ["self_attn.q_proj.bias", "self_attn.k_proj.bias", "self_attn.v_proj.bias"], f"encoder_l{nth + 1}_self_bt", bias=True) + + convert(pd, ["self_attn.out_proj.weight"], f"encoder_l{nth + 1}_self_Wo") + convert(pd, ["self_attn.out_proj.bias"], f"encoder_l{nth + 1}_self_bo", bias=True) + + convert(pd, ["self_attn_layer_norm.weight"], f"encoder_l{nth + 1}_self_Wo_ln_scale", bias=True) + convert(pd, ["self_attn_layer_norm.bias"], f"encoder_l{nth + 1}_self_Wo_ln_bias", bias=True) + + convert(pd, ["fc1.weight"], f"encoder_l{nth + 1}_ffn_W1") + convert(pd, ["fc1.bias"], f"encoder_l{nth + 1}_ffn_b1", bias=True) + convert(pd, ["fc2.weight"], f"encoder_l{nth + 1}_ffn_W2") + convert(pd, ["fc2.bias"], f"encoder_l{nth + 1}_ffn_b2", bias=True) + + convert(pd, 
["final_layer_norm.weight"], f"encoder_l{nth + 1}_ffn_ffn_ln_scale", bias=True) + convert(pd, ["final_layer_norm.bias"], f"encoder_l{nth + 1}_ffn_ffn_ln_bias", bias=True) + + config["transformer-dim-ffn"] = pd["fc1.bias"].shape[-1] + config["transformer-heads"] = layer.self_attn.num_heads + config["enc-depth"] += 1 + + elif name == "Embedding": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + convert(pd, ["weight"], f"Wemb", transpose=False) + + config["dim-emb"] = pd["weight"].shape[1] + config["dim-vocabs"] = [ pd["weight"].shape[0] ] + + elif name == "LearnedPositionalEmbedding": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + convert(pd, ["weight"], f"Wpos", transpose=False) + + config["max-length"] = pd["weight"].shape[0] + + elif name == "RobertaLMHead": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + + pd = dict(layer.named_parameters()) + convert(pd, ["dense.weight"], f"masked-lm_ff_logit_l1_W") + convert(pd, ["dense.bias"], f"masked-lm_ff_logit_l1_b", bias=True) + convert(pd, ["layer_norm.weight"], f"masked-lm_ff_ln_scale", bias=True) + convert(pd, ["layer_norm.bias"], f"masked-lm_ff_ln_bias", bias=True) + + convert(pd, ["bias"], f"masked-lm_ff_logit_l2_b", bias=True) + # reuse Wemb here as weight + # convert(pd, ["weight"], f"masked-lm_ff_logit_l2_b") + + elif name == "LayerNorm": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + + pd = dict(layer.named_parameters()) + convert(pd, ["weight"], f"encoder_emb_ln_scale_pre", bias=True) + convert(pd, ["bias"], f"encoder_emb_ln_bias_pre", bias=True) + + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(roberta) + +for m in marianModel: + print(m, marianModel[m].shape) + +configYamlStr = yaml.dump(config, default_flow_style=False) +desc = list(configYamlStr) +npDesc = np.chararray((len(desc),)) +npDesc[:] = desc +npDesc.dtype = np.int8 +marianModel["special:model.yml"] = npDesc + +print("\nMarian config:") +print(configYamlStr) +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) \ No newline at end of file diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py new file mode 100644 index 000000000..9ddbb45c1 --- /dev/null +++ b/scripts/comet/comet2marian.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +This script converts Unbabel COMET-QE models to Marian weight file. 
+""" + +import argparse +import yaml +import numpy as np + +parser = argparse.ArgumentParser(description='Convert Unbabel COMET-QE models to Marian weight file.') +inputs = parser.add_mutually_exclusive_group(required=True) +inputs.add_argument('--comet', help='Path to COMET model') +inputs.add_argument('--roberta', help='Initialize with Roberta model', action='store_true') +parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +parser.add_argument('-s', '--add_sigmoid', help='Add final sigmoid if not already present', action='store_true') +args = parser.parse_args() + + +if args.roberta: + from transformers import AutoModel + # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large + robertaModel = AutoModel.from_pretrained("microsoft/infoxlm-large", add_pooling_layer=False) + robertaModel.eval() + print(robertaModel) + cometModel = robertaModel +else: + from comet import load_from_checkpoint + cometModel = load_from_checkpoint(args.comet) + cometModel.eval() + print(cometModel) + +marianModel = dict() + +config = dict() +config["type"] = "comet-qe" +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False +config["transformer-ffn-depth"] = 2 +config["transformer-ffn-activation"] = "gelu" # figure this out dynamically +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = False +config["bert-type-vocab-size"] = 0 +config["comet-prepend-zero"] = True +config["comet-final-sigmoid"] = args.add_sigmoid +config["comet-pooler-ffn"] = [2048, 1024] +# @TODO: figure out if it's worth adding `cometModel.name_or_path` to the end of this version string. 
+config["version"] = "comet2marian2.py conversion" +config["enc-depth"] = 0 + +def yaml2np(config): + configYamlStr = yaml.dump(config, default_flow_style=False) + print("\nMarian config:") + print(configYamlStr) + + desc = bytes(configYamlStr, 'ascii') + b'\x00' + npDesc = np.chararray((len(desc),)) + npDesc.dtype = np.int8 + for i, b in enumerate(desc): + npDesc[i] = b + return npDesc + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = num.copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(num) for num in nums] + marianModel[trg] = np.stack(nums, axis=0).copy() + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + if "RobertaLayer" in name: + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + prefix = "CometEncoder" + + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->selfAttentionBlock" + + # self-attention + # query transformation + convert(pd, ["attention.self.query.weight"], f"{blockPrefix}->selfAttention->qProj->weight") + convert(pd, ["attention.self.query.bias"], f"{blockPrefix}->selfAttention->qProj->bias", bias=True) + + # key transformation + convert(pd, ["attention.self.key.weight"], f"{blockPrefix}->selfAttention->kProj->weight") + convert(pd, ["attention.self.key.bias"], f"{blockPrefix}->selfAttention->kProj->bias", bias=True) + + # values transformation + convert(pd, ["attention.self.value.weight"], f"{blockPrefix}->selfAttention->vProj->weight") + convert(pd, ["attention.self.value.bias"], f"{blockPrefix}->selfAttention->vProj->bias", bias=True) + + # output transformation + convert(pd, ["attention.output.dense.weight"], f"{blockPrefix}->selfAttention->oProj->weight") + convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) + + # self-attention layer-norm + convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + # ffn + # first ffn layer + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->filterBlock" + + convert(pd, ["intermediate.dense.weight"], f"{blockPrefix}->layers->at(0)->as()->weight") + convert(pd, ["intermediate.dense.bias"], f"{blockPrefix}->layers->at(0)->as()->bias", bias=True) + # second ffn layer + convert(pd, ["output.dense.weight"], f"{blockPrefix}->layers->at(3)->as()->weight") + convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) + # ffn layer-norm + convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + config["transformer-dim-ffn"] = pd["intermediate.dense.bias"].shape[-1] + config["transformer-heads"] = layer.attention.self.num_attention_heads + config["enc-depth"] += 1 + + elif "RobertaEmbeddings" in name: + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + # shift word embeddings so that we are back at 250,000 vocab items + npWembTemp = 
pd["word_embeddings.weight"].detach().numpy() + npWemb = npWembTemp[1:-1, :].copy() + npWemb[0, :] = npWembTemp[0, :] + npWemb[2, :] = npWembTemp[2, :] + marianModel["Wemb"] = npWemb + + prefix = "CometEncoder" + + # shift position embeddings so that we are back at 512 items and start at 0 + npPos = pd["position_embeddings.weight"].detach().numpy() + npPos = npPos[2:, :].copy() + marianModel[f"{prefix}->encoder->positionEmbedding->embeddings"] = npPos + + # post-embedding layer normalization + convert(pd, ["LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) + convert(pd, ["LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) + + config["dim-emb"] = npWemb.shape[1] + config["dim-vocabs"] = [ npWemb.shape[0] ] + config["max-length"] = npPos.shape[0] + + elif name == "LayerwiseAttention": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + # mix layers + weights = [] + for i in range(25): + weights.append(pd[f"scalar_parameters.{i}"].detach().numpy()) + marianModel["CometEncoder->encoder->weights"] = np.concatenate(weights).copy() + + # gamma for weird batch/layer-norm step in pooler/encoder of COMET + # @TODO: make optional + marianModel["CometEncoder->encoder->gamma"] = pd["gamma"].detach().numpy().copy() + config["comet-mix"] = True + config["comet-mix-norm"] = True + + + elif name == "FeedForward": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + if layer.ff[-1].__class__.__name__ == "Sigmoid" or args.add_sigmoid: + config["comet-final-sigmoid"] = True + + config["comet-pooler-ffn"] = [ + pd["ff.0.bias"].shape[0], + pd["ff.3.bias"].shape[0] + ] + + # 3-layer FFN network that computes COMET regression + prefix = "CometQEPooler" + + # @TODO: make final sigmoid optional + convert(pd, ["ff.0.weight"], f"{prefix}->layers->at(0)->as()->weight") + convert(pd, ["ff.0.bias"], f"{prefix}->layers->at(0)->as()->bias", bias=True) + + convert(pd, ["ff.3.weight"], f"{prefix}->layers->at(3)->as()->weight") + convert(pd, ["ff.3.bias"], f"{prefix}->layers->at(3)->as()->bias", bias=True) + + convert(pd, ["ff.6.weight"], f"{prefix}->layers->at(6)->as()->weight") + convert(pd, ["ff.6.bias"], f"{prefix}->layers->at(6)->as()->bias", bias=True) + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(cometModel) +marianModel["special:model.yml"] = yaml2np(config) + +for m in marianModel: + print(m, marianModel[m].shape) + +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp index 75d9bdf97..653ca6f8a 100644 --- a/src/common/aliases.cpp +++ b/src/common/aliases.cpp @@ -227,6 +227,34 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) { config["valid-mini-batch"] = 8; config["normalize"] = 1.0; }); + + // Model architecture for Unbabel's COMET-QE models + cli.alias("task", "comet-qe", [](YAML::Node& config) { + // Model options + config["bert-train-type-embeddings"] = false; + config["bert-type-vocab-size"] = 0; + config["comet-final-sigmoid"] = true; + config["comet-mix"] = false; + config["comet-mix-norm"] = false; + config["comet-dropout"] = 0.1; + config["comet-pooler-ffn"] = std::vector({2048, 1024}); + config["comet-prepend-zero"] = true; + config["dim-emb"] = 1024; + config["dim-vocabs"] = std::vector({250000}); + 
config["enc-depth"] = 24; + config["max-length"] = 512; + config["valid-max-length"] = 512; + config["tied-embeddings-all"] = true; + config["transformer-dim-ffn"] = 4096; + config["transformer-ffn-activation"] = "gelu"; + config["transformer-ffn-depth"] = 2; + config["transformer-heads"] = 16; + config["transformer-postprocess"] = "dan"; + config["transformer-postprocess-emb"] = "nd"; + config["transformer-preprocess"] = ""; + config["transformer-train-position-embeddings"] = true; + config["type"] = "comet-qe"; + }); } } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 4cc23f2ca..aaeeb514b 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -323,6 +323,17 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--bert-masking-fraction", "Fraction of masked out tokens during training", 0.15f); cli.add("--bert-train-type-embeddings", "Train bert type embeddings, set to false to use static sinusoidal embeddings", true); cli.add("--bert-type-vocab-size", "Size of BERT type vocab (sentence A and B)", 2); + + // Options specific for the "comet-qe" model type + cli.add("--comet-final-sigmoid", "Add final sigmoid to COMET model"); + cli.add("--comet-mix", "Mix encoder layers to produce embedding"); + cli.add("--comet-mix-norm", "Normalize layers prior to mixing"); + cli.add("--comet-dropout", "Dropout for pooler layers", 0.1f); + cli.add("--comet-mixup", "Alpha parameter for Beta distribution for mixup", 0.0f); + cli.add("--comet-mixup-reg", "Use original and mixed-up samples in training"); + cli.add>("--comet-pooler-ffn", "Hidden sizes for comet pooler", {2048, 1024}); + cli.add("--comet-prepend-zero", "Add a start symbol to batch entries"); + #ifdef CUDNN cli.add("--char-stride", "Width of max-pooling layer after convolution layer in char-s2s model", diff --git a/src/common/file_stream.cpp b/src/common/file_stream.cpp index e1572f62e..e2870b17a 100644 --- a/src/common/file_stream.cpp +++ b/src/common/file_stream.cpp @@ -97,7 +97,7 @@ OutputFileStream::OutputFileStream(const std::string &file) : std::ostream(NULL), file_(file) { streamBuf1_.reset(new std::filebuf()); auto ret = static_cast(streamBuf1_.get())->open(file.c_str(), std::ios::out | std::ios_base::binary); - ABORT_IF(!ret, "File cannot be opened", file); + ABORT_IF(!ret, "Error opening file ({}): {}", errno, file_.string()); ABORT_IF(ret != streamBuf1_.get(), "Return value is not equal to streambuf pointer, that is weird"); if(file_.extension() == marian::filesystem::Path(".gz")) { diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index d276ca6bc..a429ae2f3 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -59,6 +59,7 @@ CorpusBase::CorpusBase(const std::vector& paths, maxLength_(options_->get("max-length")), maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), + prependZero_(options_->get("comet-prepend-zero", false)), tsv_(options_->get("tsv", false)), tsvNumInputFields_(getNumberOfTSVInputFields(options)) { // TODO: support passing only one vocab file if we have fully-tied embeddings @@ -84,6 +85,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) maxLength_(options_->get("max-length")), maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), + prependZero_(options_->get("comet-prepend-zero", false)), tsv_(options_->get("tsv", false)), tsvNumInputFields_(getNumberOfTSVInputFields(options)) { bool training = !translate; @@ -420,9 
+422,13 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece // is used. Words words = vocabs_[batchIndex]->encode(line, /*addEOS =*/ addEOS_[batchIndex], inference_); - ABORT_IF(words.empty(), "Empty input sequences are presently untested"); + auto inputTypes = options_->get>("input-types", {}); // empty list by default + + if(prependZero_ && inputTypes[batchIndex] == "sequence") + words.insert(words.begin(), Word::fromWordIndex(0)); + if(maxLengthCrop_ && words.size() > maxLength_) { words.resize(maxLength_); if(addEOS_[batchIndex]) diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 2e572ebd8..123250d97 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -638,6 +638,7 @@ class CorpusBase : public DatasetBaseSampleEncode(line, -1, alpha_, &spmIds); - Words words; words.reserve(spmIds.size() + addEOS); + Words words; + words.reserve(spmIds.size() + addEOS); for (auto&& spmId : spmIds) words.push_back(Word::fromWordIndex(spmId)); diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index d45e14cd3..ebd9782e2 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -19,7 +19,7 @@ using namespace data; /* * The tool is used to create output sentence embeddings from available - * Marian encoders. With --compute-similiarity and can return the cosine + * Marian encoders. With --compute-similiarity it can return the cosine * similarity between two sentences provided from two sources. */ class Embedder { @@ -56,8 +56,7 @@ class Embed : public ModelTask { Embed(Ptr options) : options_(options) { options_ = options_->with("inference", true, - "shuffle", "none", - "input-types", std::vector({"sequence"})); + "shuffle", "none"); // if a similarity is computed then double the input types and vocabs for // the two encoders that are used in the model. @@ -109,7 +108,7 @@ class Embed : public ModelTask { auto batchGenerator = New>(corpus_, options_); batchGenerator->prepare(); - auto output = New(options_); + auto output = New(options_->get("output"), options_->get("binary")); size_t batchId = 0; { @@ -128,8 +127,19 @@ class Embed : public ModelTask { auto embeddings = builder->build(graph, batch); graph->forward(); + // handle copying from fp32 or fp16 embeddings correctly. std::vector sentVectors; - embeddings->val()->get(sentVectors); + if(embeddings->value_type() == Type::float32) { + embeddings->val()->get(sentVectors); + } else if (embeddings->value_type() == Type::float16) { + std::vector sentVectors16; + embeddings->val()->get(sentVectors16); + sentVectors.reserve(sentVectors16.size()); + for(auto& v: sentVectors16) + sentVectors.push_back(v); + } else { + ABORT("Unknown embedding type {}", embeddings->value_type()); + } // collect embedding vector per sentence. // if we compute similarities this is only one similarity per sentence pair. diff --git a/src/embedder/vector_collector.cpp b/src/embedder/vector_collector.cpp index c1caf2f7b..11b07b43b 100644 --- a/src/embedder/vector_collector.cpp +++ b/src/embedder/vector_collector.cpp @@ -11,14 +11,17 @@ namespace marian { // This class manages multi-threaded writing of embedded vectors to stdout or an output file. // It will either output string versions of float vectors or binary equal length versions depending // on its binary_ flag. 
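+// Illustrative use of the two constructors below (variable and file names are examples only):
+//   auto toStdout = New<VectorCollector>();                                  // text floats to stdout
+//   auto toFile   = New<VectorCollector>("embeddings.bin", /*binary=*/true); // binary floats to a file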
+VectorCollector::VectorCollector(bool binary) + : nextId_(0), + binary_(binary) {} -VectorCollector::VectorCollector(const Ptr& options) - : nextId_(0), binary_{options->get("binary", false)} { - if(options->get("output") == "stdout") - outStrm_.reset(new std::ostream(std::cout.rdbuf())); - else - outStrm_.reset(new io::OutputFileStream(options->get("output"))); - } +VectorCollector::VectorCollector(std::string outFile, bool binary) + : nextId_(0), + outStrm_(new std::ostream(std::cout.rdbuf())), + binary_(binary) { + if (outFile != "stdout") + outStrm_.reset(new io::OutputFileStream(outFile)); +} void VectorCollector::Write(long id, const std::vector& vec) { std::lock_guard lock(mutex_); @@ -60,8 +63,7 @@ void VectorCollector::WriteVector(const std::vector& vec) { if(binary_) { outStrm_->write((char*)vec.data(), vec.size() * sizeof(float)); } else { - std::stringstream ss; - ss << std::fixed << std::setprecision(8); + *outStrm_ << std::fixed << std::setprecision(4); for(auto v : vec) *outStrm_ << v << " "; *outStrm_ << std::endl; diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index 80110958a..fc39ea6ec 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -14,7 +14,8 @@ namespace marian { // on its binary_ flag. class VectorCollector { public: - VectorCollector(const Ptr& options); + VectorCollector(bool binary=false); + VectorCollector(std::string outFile, bool binary=false); virtual ~VectorCollector() {} virtual void Write(long id, const std::vector& vec); diff --git a/src/functional/operators.h b/src/functional/operators.h index 80b40ff40..3628fdcb9 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -1,6 +1,8 @@ #pragma once #include "common/types.h" + +#define _USE_MATH_DEFINES #include namespace marian { @@ -24,7 +26,8 @@ struct Ops { static HOST_DEVICE_INLINE T sqrt(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T neg(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T sgn(const T&) { ABORT("Unknown type"); } - + static HOST_DEVICE_INLINE T erf(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T round(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T floor(const T&) { ABORT("Unknown type"); } static HOST_DEVICE_INLINE T ceil(const T&) { ABORT("Unknown type"); } @@ -82,6 +85,7 @@ struct Ops { static HOST_DEVICE_INLINE float sqrt(const float& x) { return sqrtf(x); } static HOST_DEVICE_INLINE float neg(const float& x) { return -x; } static HOST_DEVICE_INLINE float sgn(const float& x) { return (float)((0 < x) - (x < 0)); } + static HOST_DEVICE_INLINE float erf(const float& x) { return erff(x); } static HOST_DEVICE_INLINE float round(const float& x) { return roundf(x); } static HOST_DEVICE_INLINE float floor(const float& x) { return floorf(x); } @@ -151,6 +155,7 @@ struct Ops { static HOST_DEVICE_INLINE double sqrt(const double& x) { return std::sqrt(x); } static HOST_DEVICE_INLINE double neg(const double& x) { return -x; } static HOST_DEVICE_INLINE double sgn(const double& x) { return (0 < x) - (x < 0); } + static HOST_DEVICE_INLINE double erf(const double& x) { return std::erf(x); } static HOST_DEVICE_INLINE double round(const double& x) { return std::round(x); } static HOST_DEVICE_INLINE double floor(const double& x) { return std::floor(x); } @@ -265,6 +270,7 @@ struct Ops { // @TODO: get rid of loop4 with proper intrisics static inline float32x4 sgn(const float32x4& x) { return loop4(Ops::sgn, x); } + static inline float32x4 erf(const 
float32x4& x) { return loop4(Ops::erf, x); } static inline float32x4 round(const float32x4& x) { return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT); } static inline float32x4 floor(const float32x4& x) { return _mm_floor_ps(x); } @@ -394,6 +400,7 @@ struct Ops { // @TODO: get rid of loop8 with proper intrisics static inline float32x8 sgn(const float32x8& x) { return loop8(Ops::sgn, x); } + static inline float32x8 erf(const float32x8& x) { return loop8(Ops::erf, x); } static inline float32x8 round(const float32x8& x) { return _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT); } static inline float32x8 floor(const float32x8& x) { return _mm256_floor_ps(x); } @@ -494,6 +501,7 @@ struct Ops { #endif static DEVICE_INLINE half sgn(const half& x) { half zero = 0.f; return (zero < x) - (x < zero); } // @TODO half has this information somewhere in the struct, right? + static DEVICE_INLINE half erf(const half& x) { return erff((float)x); } static DEVICE_INLINE half round(const half& x) { return hrint(x); } static DEVICE_INLINE half floor(const half& x) { return hfloor(x); } @@ -597,6 +605,7 @@ struct Ops { #endif static DEVICE_INLINE halfx2 sgn(const halfx2& x) { halfx2 zero(0.f, 0.f); return __hsub2(__hlt2(zero, x), __hlt2(x, zero)); } + static DEVICE_INLINE halfx2 erf(const halfx2& x) { return {Ops::erf(x[0]), Ops::erf(x[1])}; } static DEVICE_INLINE halfx2 round(const halfx2& x) { return h2rint(x); } static DEVICE_INLINE halfx2 floor(const halfx2& x) { return h2floor(x); } @@ -714,6 +723,7 @@ UNARY(Sqr, sqr, Ops::sqr(x)); UNARY(Sqrt, sqrt, Ops::sqrt(x)); UNARY(Neg, operator-, Ops::neg(x)); UNARY(Sgn, sgn, Ops::sgn(x)); +UNARY(Erf, erf, Ops::erf(x)); UNARY(Round, round, Ops::round(x)); UNARY(Floor, floor, Ops::floor(x)); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index a6504ebac..c928e8ce0 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -95,7 +95,7 @@ Expr swish(Expr a) { } Expr gelu(Expr a) { - return Expression(a, 1.702f); + return Expression(a); } Expr operator-(Expr a) { diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 27121fa6d..4e78e7166 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -1,16 +1,19 @@ #pragma once -#include "tensors/backend.h" -#include "tensors/tensor.h" - +#include "common/definitions.h" #include "functional/functional.h" #include "graph/node.h" +#include "tensors/backend.h" #include "tensors/tensor_operators.h" +#include "tensors/tensor.h" #ifdef CUDNN #include "tensors/gpu/cudnn_wrappers.h" #endif +#define _USE_MATH_DEFINES // enables math constants. We need M_PI +#include + namespace marian { struct UnaryNodeOp : public NaryNodeOp { @@ -417,6 +420,33 @@ struct SwishNodeOp : public UnaryNodeOp { float b_; }; +/** + * Represents a GELU node + * in an expression graph. 
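+ *
+ * Forward pass (exact, erf-based form): gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))).
+ * The backward pass below uses the derivative
+ * d/dx gelu(x) = 0.5 * (1 + erf(x / sqrt(2))) + x * exp(-x^2 / 2) / sqrt(2 * pi).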
+ */ +struct GeluNodeOp : public UnaryNodeOp { + GeluNodeOp(Expr a) : UnaryNodeOp(a) {} + + NodeOps forwardOps() override { + using namespace functional; + return { + NodeOp(Element(_1 = 0.5f * _2 * (1.f + erf(_2 / sqrt(2.f))), val_, child(0)->val())) + }; + } + + NodeOps backwardOps() override { + using namespace functional; + auto erf_prime = (2.f / sqrt((float)M_PI)) * exp(-(_1 * _1) / 2.f); + auto dx = 0.5 * (erf(_1 / sqrt(2.f)) + _1 * erf_prime / sqrt(2.f) + 1.f); + return {NodeOp(Add(dx * _2, + child(0)->grad(), + child(0)->val(), + adj_))}; + } + + const std::string type() override { return "gelu"; } +}; + struct SoftmaxNodeOp : public UnaryNodeOp { SoftmaxNodeOp(Expr a) : UnaryNodeOp(a) {} diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 334f0b865..85c14f51b 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -40,6 +40,13 @@ Embedding::Embedding(Ptr graph, Ptr options) } } +#if 0 + auto emb = graph_->get(name); + if(emb) { + dimVoc = emb->shape()[-2]; + } +#endif + E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); } diff --git a/src/layers_new/embeddings.h b/src/layers_new/embeddings.h index b7d297b63..e080906fe 100644 --- a/src/layers_new/embeddings.h +++ b/src/layers_new/embeddings.h @@ -202,9 +202,9 @@ struct LearnedPositionEmbedding : public PositionEmbeddingLayer { int dimEmb = input->shape()[-1]; int dimWords = input->shape()[positionAxis]; - registerParameter(embeddings, - Shape({maxLength, dimEmb}), - inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true)); + registerParameterLazy(embeddings, + Shape({maxLength, dimEmb}), + inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true)); ABORT_IF(start + dimWords > maxLength, "Number of positions ({}) starting at position {} exceeds maximum length {}", diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index 3302d9d85..8776820ef 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -212,7 +212,7 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. */ -struct TransformerEncoder final : public LayerWithOptions, public IBinaryLayer { +struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { Ptr positionEmbedding; Ptr preprocessor; Ptr layers; @@ -250,6 +250,8 @@ struct TransformerEncoder final : public LayerWithOptions, public IBinaryLayer { registerLayer(postprocessor); } + virtual ~TransformerEncoder() = default; + Expr apply(Expr input, Expr mask = nullptr) const override { // first and last operations (see at the bottom of this function) switch the time and batch // dimensions. 
This order is more natural for the transformer, but more difficult to handle diff --git a/src/models/bert.h b/src/models/bert.h index 99dfae55e..1e0153e6c 100644 --- a/src/models/bert.h +++ b/src/models/bert.h @@ -238,6 +238,7 @@ class BertEncoder : public EncoderTransformer { ("prefix", "Wtype") ("dimVocab", dimTypeVocab) // sentence A or sentence B ("dimEmb", dimEmb) + ("inference", graph_->isInference()) .construct(graph_); signal = sentenceEmbeddings->applyIndices(bertBatch->bertSentenceIndices(), {dimWords, dimBatch, dimEmb}); } else { diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h new file mode 100644 index 000000000..cca18cac7 --- /dev/null +++ b/src/models/comet_qe.h @@ -0,0 +1,327 @@ +#pragma once + +#include "layers_new/transformer.h" + +#include "models/encoder.h" +#include "layers/constructors.h" + +namespace marian { +namespace models { + +struct CometEncoder final : public nn::TransformerEncoder { + Expr weights; + Expr gamma; + + CometEncoder(Ptr graph, + Ptr options) + : TransformerEncoder(graph, options) {} + + Expr apply(Expr input, Expr mask) const override { + auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] + + mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + auto binMask = mask; + mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); + + // apply positional embeddings to contextual input + output = positionEmbedding->apply(output); + + // handle for skip connection at top + auto prevOutput = output; + + // apply dropout or layer-norm to embeddings if required + output = preprocessor->apply(output); + + // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. + // It norms over time, not batch, also should be optimized. Seems safe to disable for custom + // models trained by us, but required when doing inference with Unbabel models. + auto cometNorm = [&, this](Expr x, Expr binMask) { + if(opt("comet-mix-norm", false)) { + registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); + int dimModel = x->shape()[-1]; + + // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. 
+ Type origType = x->value_type(); + x = marian::cast(x, Type::float32); + binMask = marian::cast(binMask, Type::float32); + + x = x * binMask; + auto denom = (float)dimModel * sum(binMask, -2); + auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time + auto sigma = sum(sum(square(x - mu), -1), -2) / denom; + + auto normed = (x - mu) / sqrt(sigma + 1e-12f); + auto output = marian::cast(gamma, Type::float32) * sum(normed * binMask, -2) / sum(binMask, -2); + + // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) + return marian::cast(output, origType); + } else { + return sum(x * binMask, -2) / sum(binMask, -2); + } + }; + + std::vector pooler; + if(opt("comet-mix", false)) + pooler.push_back(cometNorm(output, binMask)); + + // traverse the layers, use the same mask for each + for(auto layer : *layers) { + output = layer->apply(output, mask); + if(opt("comet-mix", false)) + pooler.push_back(cometNorm(output, binMask)); // [ batch, time, modelDim ] + } + + if(opt("comet-mix", false)) { + registerParameterLazy(weights, Shape({ opt("enc-depth") + 1 }), inits::ones()); + auto weightsNorm = reshape(softmax(weights), {weights->shape()[-1], 1}); + output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] + } else { + // just use last layer, average over time dim + output = cometNorm(output, binMask); // [batch, 1, modelDim] + } + + return output; + } +}; + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +struct CometBatchEncoder final : public nn::LayerWithOptions, + public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings + public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? + Ptr encoder; + + CometBatchEncoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + EncoderBase(graph, options) + { + encoder = New(graph, options); + registerLayer(encoder); + } + + // @TODO: subBatch should be of type Expr + virtual std::tuple apply(Ptr subBatch) const override { + // @TODO: this is still using the bad old interface + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + const auto& [batchEmbedding, batchMask] = embeddingLayer->apply(subBatch); + + auto batchContext = encoder->apply(batchEmbedding, batchMask); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return std::make_tuple(batchContext, batchMask); + } + + virtual Expr apply(const Words& words, const Shape& shape) const override final { + return applyIndices(toWordIndexVector(words), shape); + } + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& wordIndices, const Shape& shape) const override final { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + Expr batchEmbedding = embeddingLayer->applyIndices(wordIndices, shape); + auto batchContext = encoder->apply(batchEmbedding, /*mask=*/nullptr); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return batchContext; + } + + // @TODO: currently here for backwards compat, should be replaced with apply() + virtual Ptr build(Ptr graph, + Ptr batch) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + EncoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. 
+ // A good check to have for catching weird situations early. + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); + return New(batchEmbedding, batchMask, batch); + } + + virtual void clear() override { + Layer::clear(); + } +}; + +class CometQEPooler final : public nn::LayerWithOptions, + public PoolerBase { +private: + Ptr layers; + std::mt19937 rng{(uint32_t)Config::seed}; + +public: + CometQEPooler(Ptr graph, Ptr options) + : LayerWithOptions(graph, options), + PoolerBase(graph, options) { + + float dropoutProb = LayerWithOptions::opt("comet-dropout", 0.1f); + auto ffnHidden = LayerWithOptions::opt>("comet-pooler-ffn", {2048, 1024}); + layers = New( + graph, + New(graph, ffnHidden[0]), + New(graph), + New(graph, dropoutProb), + New(graph, ffnHidden[1]), + New(graph), + New(graph, dropoutProb), + New(graph, 1) + ); + + if(LayerWithOptions::opt("comet-final-sigmoid")) + layers->append(New(graph)); + + registerLayer(layers); + } + + std::vector apply(Ptr graph, Ptr batch, const std::vector>& encoderStates) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + PoolerBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. + // A good check to have for catching weird situations early. + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + auto beta = [](float alpha, std::mt19937& gen) { + // Generate random numbers x and y from gamma distributions with the given alpha and beta parameters + std::gamma_distribution gamma(alpha, 1.f); + float x = gamma(gen); + float y = gamma(gen); + return x / (x + y); + }; + + auto mixup = [&](Expr x, Expr y, float alpha, bool reg=true) -> Expr2 { + if(alpha == 0.f) + return {x, y}; + + int dimBatch = x->shape()[-3]; + Type xType = x->value_type(); + + std::vector indices(dimBatch); + std::iota(indices.begin(), indices.end(), 0); + + // permute the indices and select batch entries accordingly + std::shuffle(indices.begin(), indices.end(), rng); + auto xPrime = index_select(x, -3, indices); + auto yPrime = index_select(y, -3, indices); + + std::vector lambdasVec(dimBatch); + std::generate(lambdasVec.begin(), lambdasVec.end(), [&]{ return beta(alpha, rng); }); + auto lambdas = graph->constant({dimBatch, 1, 1}, inits::fromVector(lambdasVec), Type::float32); + + auto xMixup = (1.f - marian::cast(lambdas, xType)) * x + marian::cast(lambdas, xType) * xPrime; + auto yMixup = (1.f - lambdas) * y + lambdas * yPrime; + + if(reg) { + // return original and mixed samples + xMixup = concatenate({x, xMixup}, /*axis=*/-2); + yMixup = concatenate({y, yMixup}, /*axis=*/-2); + } + + return {xMixup, yMixup}; + }; + + ABORT_IF(encoderStates.size() != 2, "Pooler expects exactly two encoder state"); + + auto src = encoderStates[0]->getContext(); + auto mt = encoderStates[1]->getContext(); + + auto diff = abs(mt - src); + auto prod = mt * src; + + Expr output; + if(LayerWithOptions::opt("usage") == (int)models::usage::embedding) { + auto embFwd = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + auto embBwd = concatenate({src, mt, prod, diff}, /*axis=*/-1); // [batch, 1, model] + auto emb = concatenate({embFwd, embBwd}, /*axis=*/-2); + output = layers->apply(emb); + + int dimBatch = 
output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 2}); + return { output }; + } else { + auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + + auto softLabelsWords = batch->front()->data(); + auto classVocab = batch->front()->vocab(); + + int dimBatch = (int)softLabelsWords.size(); + std::vector softLabels; + for(auto w : softLabelsWords) { + // @TODO: this is a super-ugly hack to get regression values + float score = w != Word::NONE ? std::stof((*classVocab)[w]) : 0.f; + softLabels.push_back(score); + } + auto labels = graph->constant({dimBatch, 1, 1}, inits::fromVector(softLabels), Type::float32); + + if(getMode() == Mode::train) { + float mixupAlpha = LayerWithOptions::opt("comet-mixup", 0.f); + bool mixupReg = LayerWithOptions::opt("comet-mixup-reg", false); + auto xy = mixup(emb, labels, mixupAlpha, mixupReg); + emb = get<0>(xy); + labels = get<1>(xy); + } + output = marian::cast(layers->apply(emb), Type::float32); + return { output, labels }; + } + } + + void clear() override {} +}; + +// Wraps an EncoderClassifier so it can produce a cost from raw logits. @TODO: Needs refactoring +class CometBinaryCE final : public ICost { +protected: + Ptr options_; + const bool inference_{false}; + const bool rescore_{false}; + +public: + CometBinaryCE(Ptr options) + : options_(options), inference_(options->get("inference", false)), + rescore_(options->get("cost-type", "ce-sum") == "ce-rescore") { } + + Ptr apply(Ptr model, + Ptr graph, + Ptr batch, + bool clearGraph = true) override { + auto encpool = std::static_pointer_cast(model); + auto corpusBatch = std::static_pointer_cast(batch); + + auto inputTypes = options_->get>("input-types", {}); + ABORT_IF(inputTypes != std::vector({"class", "sequence", "sequence"}), + "Expected input-types to be have fields (class, sequence, sequence)"); + ABORT_IF(corpusBatch->sets() != 3, "Expected 3 sub-batches, not {}", corpusBatch->sets()); + + auto lossFn = [&](Expr x, Expr y) { + float eps = 1e-5f; + if(!options_->get("comet-final-sigmoid")) + x = sigmoid(x); + return -(y * log(x + eps) + (1.f - y) * log((1.f + eps) - x)); + }; + + auto encoded = encpool->apply(graph, corpusBatch, clearGraph); + + Expr x = encoded[0]; + Expr y = encoded[1]; + auto loss = lossFn(x, y); + + loss = mean(loss, /*axis=*/-2); // this should only do something with mixup regularization + + int dimBatch = loss->shape()[-3]; + if(rescore_) + loss = reshape(loss, {1, dimBatch, 1}); + else + loss = sum(loss, /*axis=*/-3); // [1, 1, 1] + + Ptr multiLoss = New(); + RationalLoss lossPiece(loss, (float)dimBatch); + multiLoss->push_back(lossPiece); + + return multiLoss; + } +}; + +} // namespace models +} // namespace marian + diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index 124d873c5..0a781c9d5 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -122,7 +122,6 @@ class EncoderPooler : public EncoderPoolerBase { "skip", "layer-normalization", "right-left", - "input-types", "special-vocab", "tied-embeddings", "tied-embeddings-src", @@ -158,6 +157,12 @@ class EncoderPooler : public EncoderPoolerBase { modelFeatures_.insert("lemma-dependency"); modelFeatures_.insert("factors-combine"); modelFeatures_.insert("factors-dim-emb"); + + modelFeatures_.insert("comet-prepend-zero"); + modelFeatures_.insert("comet-pooler-ffn"); + modelFeatures_.insert("comet-final-sigmoid"); + modelFeatures_.insert("comet-mix"); + modelFeatures_.insert("comet-mix-norm"); } virtual Ptr getOptions() override { return options_; 
} diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 17ee2a4d9..40ba122a6 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -1,5 +1,7 @@ #include "marian.h" +#include "common/fastopt.h" + #include "models/model_factory.h" #include "models/encoder_decoder.h" #include "models/encoder_classifier.h" @@ -14,6 +16,8 @@ #include "models/transformer_factory.h" #include "models/transformer_new.h" +#include "models/comet_qe.h" + #ifdef CUDNN #include "models/char_s2s.h" #endif @@ -46,7 +50,7 @@ Ptr EncoderFactory::construct(Ptr graph) { if(options_->get("type") == "bert-encoder") return New(graph, options_); - ABORT("Unknown encoder type"); + ABORT("Unknown encoder type {}", options_->get("type")); } Ptr DecoderFactory::construct(Ptr graph) { @@ -69,7 +73,7 @@ Ptr ClassifierFactory::construct(Ptr graph) { Ptr PoolerFactory::construct(Ptr graph) { if(options_->get("type") == "max-pooler") return New(graph, options_); - if(options_->get("type") == "slice-pooler") + else if(options_->get("type") == "slice-pooler") return New(graph, options_); else if(options_->get("type") == "sim-pooler") return New(graph, options_); @@ -136,6 +140,34 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti size_t fields = trainEmbedderRank ? dimVocabs.size() : 0; int dimVocab = dimVocabs[0]; + if(type == "comet-qe") { + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + auto inputTypes = options->get>("input-types"); + ABORT_IF(inputTypes.empty(), + "Required option --input-types for COMET-QE not set. " + "For inference that should be --input-types sequence sequence. " + "For training set --input-types class sequence sequence"); + + int shift = 0; + if(inputTypes[0] == "class") + shift = 1; + + auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); + enc1->setName("CometEncoder"); + res->push_back(enc1); + + auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 1 + shift)); + enc2->setName("CometEncoder"); + res->push_back(enc2); + + auto pooler = New(graph, newOptions); + pooler->setName("CometQEPooler"); + res->push_back(pooler); + return res; + } + Ptr newOptions; if(options->get("compute-similarity", false)) { newOptions = options->with("usage", use, @@ -175,6 +207,28 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti return res; } + if(use == usage::training || use == usage::scoring) { + if(type == "comet-qe") { + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + // For training, first rank in batch is class! 
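+ // i.e. --input-types is expected to be (class, sequence, sequence): sub-batch 0 holds the
+ // regression label, sub-batches 1 and 2 hold the two text inputs, hence encoder indices 1 and 2 below.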
+ + auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 1)); + enc1->setName("CometEncoder"); + res->push_back(enc1); + + auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 2)); + enc2->setName("CometEncoder"); + res->push_back(enc2); + + auto pooler = New(graph, newOptions); + pooler->setName("CometQEPooler"); + res->push_back(pooler); + return res; + } + } + if(type == "s2s" || type == "amun" || type == "nematus") { return models::encoder_decoder(options->with( "usage", use, @@ -435,6 +489,8 @@ Ptr createCriterionFunctionFromOptions(Ptr options, return New(baseModel, New()); #endif #endif + else if (type == "comet-qe" && std::dynamic_pointer_cast(baseModel)) + return New(baseModel, New(options)); else if (std::dynamic_pointer_cast(baseModel)) return New(baseModel, New(options)); else diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 6d4c4a95d..1b233bb1b 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -36,6 +36,7 @@ template void Add, BinaryFunctor >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Add, marian::functional::Assignee<2> > >, IntrusivePtr, IntrusivePtr >(marian::functional::UnaryFunctor, marian::functional::Assignee<2> > >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Aggregate >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, IntrusivePtr >(marian::functional::UnaryFunctor >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); -template void marian::gpu::Add,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr >(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); -template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::gpu::Add,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr >(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); +template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr 
>(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); \ No newline at end of file diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index ba466d895..b983b7b7e 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -41,6 +41,7 @@ template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #if COMPILE_FP16 template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -83,4 +84,5 @@ template void marian::AggregateAll<__half, float, marian::functional::UnaryFunct template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > 
>(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::Assignee<1>, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #endif diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index edec0e1a7..730817849 100755 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -73,6 +73,8 @@ template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr, IntrusivePtr); + // How to add new specializations: // When you use a new specialization, it will cause a link error of this form (example): // .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element ( ... 
)' diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index cb95470f4..c160332e4 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -563,8 +563,7 @@ Ptr GraphGroup::collectStats(Ptr graph, size_t step = options_->get("mini-batch-fit-step"); size_t maxLength = options_->get("max-length"); - maxLength = (size_t)(std::ceil(maxLength / (float)step) * step); - + // this should be only one class label per line on input, hence restricting length to 1 std::vector localMaxes(numFiles, maxLength); auto inputTypes = options_->get>("input-types", {}); @@ -599,7 +598,11 @@ Ptr GraphGroup::collectStats(Ptr graph, // Do a binary search for maxmimum batch size that fits into given workspace memory // for a tested sentence length. - for(size_t i = step; i <= maxLength; i += step) { + // We round the maxLength to the next larger step to avoid a situation where we do not + // collect batch statistics for maximum length between steps. However, we do not exceed + // the actual maxLength even if the rounded value is larger. + size_t maxLengthRounded = (size_t)(std::ceil(maxLength / (float)step) * step); + for(size_t i = step; i <= maxLengthRounded; i += step) { size_t start = 1; size_t end = maxBatch; diff --git a/src/training/validator.cpp b/src/training/validator.cpp index ef1bac3db..cdc5ef5ac 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -1,4 +1,5 @@ #include "training/validator.h" +#include "embedder/vector_collector.h" namespace marian { @@ -37,6 +38,9 @@ std::vector*/>> Validators( } else if(metric == "bert-sentence-accuracy") { auto validator = New(vocabs, config, false); validators.push_back(validator); + } else if(metric == "embedding") { + auto validator = New(vocabs, config); + validators.push_back(validator); } else { ABORT("Unknown validation metric: {}", metric); } @@ -437,6 +441,115 @@ float TranslationValidator::validate(const std::vector>& gr return val; }; +/////////////////////////////////////////////////////////////////////////////////////// +EmbeddingValidator::EmbeddingValidator(std::vector> vocabs, Ptr options) + : Validator(vocabs, options, false), quiet_(options_->get("quiet-translation")) { + // @TODO: remove, only used for saving? + builder_ = models::createModelFromOptions(options_, models::usage::embedding); + + if(!options_->hasAndNotEmpty("valid-script-path")) + LOG_VALID(warn, "No post-processing script given for validating translator"); + + createBatchGenerator(/*isTranslating=*/true); +} + +float EmbeddingValidator::validate(const std::vector>& graphs, + Ptr state) { + using namespace data; + + // Generate batches + batchGenerator_->prepare(); + + std::vector> models; + for(auto graph : graphs) { + models.push_back(models::createModelFromOptions(options_, models::usage::embedding)); + graph->setInference(true); + } + + // Set up output file + std::string fileName; + Ptr tempFile; + + if(options_->hasAndNotEmpty("valid-translation-output")) { + fileName = options_->get("valid-translation-output"); + // fileName can be a template with fields for training state parameters: + fileName = state->fillTemplate(fileName); + } else { + tempFile.reset(new io::TemporaryFile(options_->get("tempdir"), false)); + fileName = tempFile->getFileName(); + } + + timer::Timer timer; + { + // @TODO: This can be simplified. If there is no "valid-translation-output", fileName already + // contains the name of temporary file that should be used? + auto output = options_->hasAndNotEmpty("valid-translation-output") + ? 
New(fileName) + : New(tempFile->getFileName()); + + std::deque> graphQueue(graphs.begin(), graphs.end()); + std::deque> modelQueue(models.begin(), models.end()); + auto task = [=, &graphQueue, &modelQueue](BatchPtr batch) { + thread_local Ptr graph; + thread_local Ptr builder; + + if(!graph) { + std::unique_lock lock(mutex_); + ABORT_IF(graphQueue.empty(), "Asking for graph, but none left on queue"); + graph = graphQueue.front(); + graphQueue.pop_front(); + + ABORT_IF(modelQueue.empty(), "Asking for scorer, but none left on queue"); + builder = modelQueue.front(); + modelQueue.pop_front(); + } + + auto embedder = std::dynamic_pointer_cast(builder); + auto corpusBatch = std::dynamic_pointer_cast(batch); + auto embeddings = cast(embedder->apply(graph, corpusBatch, /*clearGraph=*/true)[0], Type::float32); + + graph->forward(); + + std::vector sentVectors; + embeddings->val()->get(sentVectors); + + // collect embedding vector per sentence. + // if we compute similarities this is only one similarity per sentence pair. + for(size_t i = 0; i < batch->size(); ++i) { + auto embSize = embeddings->shape()[-1]; + auto beg = i * embSize; + auto end = (i + 1) * embSize; + std::vector sentVector(sentVectors.begin() + beg, sentVectors.begin() + end); + output->Write((long)batch->getSentenceIds()[i], sentVector); + } + }; + + threadPool_.reserve(graphs.size()); + TaskBarrier taskBarrier; + for(auto batch : *batchGenerator_) + taskBarrier.push_back(threadPool_.enqueue(task, batch)); + // ~TaskBarrier waits until all are done + } + + for(auto graph : graphs) + graph->setInference(false); + + float val = 0.0f; + + // Run post-processing script if given + if(options_->hasAndNotEmpty("valid-script-path")) { + // auto command = options_->get("valid-script-path") + " " + fileName; + // auto valStr = utils::exec(command); + auto valStr = utils::exec(options_->get("valid-script-path"), + options_->get>("valid-script-args"), + fileName); + val = (float)std::atof(valStr.c_str()); + updateStalled(graphs, val); + } + + return val; +}; + /////////////////////////////////////////////////////////////////////////////////////// SacreBleuValidator::SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric) : Validator(vocabs, options, /*lowerIsBetter=*/false), diff --git a/src/training/validator.h b/src/training/validator.h index 16bfd2457..d7580a500 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -359,6 +359,25 @@ class SacreBleuValidator : public Validator { bool quiet_{ false }; }; +// Validator that writes embeddings to a file and computes any metric specified with an external script +class EmbeddingValidator : public Validator { +public: + EmbeddingValidator(std::vector> vocabs, Ptr options); + virtual ~EmbeddingValidator() {} + + virtual float validate(const std::vector>& graphs, + Ptr state) override; + + std::string type() override { return "embed"; } + +protected: + bool quiet_{false}; + + virtual float validateBG(const std::vector>& /*graphs*/) override { + return 0; + } +}; + /** * @brief Creates validators from options * From 30f41daf96c1bb3e6c4e346f2f5d5dd7d4ab74bb Mon Sep 17 00:00:00 2001 From: Fai Sigalov Date: Thu, 16 Mar 2023 01:11:47 +0000 Subject: [PATCH 224/254] Merged PR 28460: Revert "Merged PR 26311: [FSM] make model loading lock non-static" locally I see this is causing a 5% regression in startup time, and we see a regression in prod as well. 
Revert "Merged PR 26311: [FSM] make model loading lock non-static" This reverts commit 4f145c450f2b4b956d175fbbfe118a90e494acf4. --- src/data/factored_vocab.cpp | 3 ++- src/data/factored_vocab.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp index f51869d56..caee2e0c3 100644 --- a/src/data/factored_vocab.cpp +++ b/src/data/factored_vocab.cpp @@ -21,7 +21,8 @@ namespace marian { maxSizeUnused; // If model has already been loaded, then assume this is a shared object, and skip loading it again. // This can be multi-threaded, so must run under lock. - std::lock_guard criticalSection(loadMtx_); + static std::mutex s_mtx; + std::lock_guard criticalSection(s_mtx); if (size() != 0) { //LOG(info, "[vocab] Attempting to load model a second time; skipping (assuming shared vocab)"); return size(); diff --git a/src/data/factored_vocab.h b/src/data/factored_vocab.h index edbee1544..b644ce4c4 100644 --- a/src/data/factored_vocab.h +++ b/src/data/factored_vocab.h @@ -110,7 +110,6 @@ class FactoredVocab : public IVocab { Word unkId_{}; WordLUT vocab_; size_t lemmaSize_; - std::mutex loadMtx_; // factors char factorSeparator_ = '|'; // separator symbol for parsing factored words From 26b178c19cf71c3254046688eea66edc61f6ea36 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Fri, 17 Mar 2023 18:57:24 +0000 Subject: [PATCH 225/254] Merged PR 28179: comet2marian.py: download comet models automatically. --comet argument can be either a model path or model ID ``` --comet COMET, -c COMET COMET model path or an ID: wmt20-comet-qe-da, wmt20-comet-qe-da-v2, wmt21-comet-qe-mqm, wmt21-comet-qe-da ``` --- scripts/comet/comet2marian.py | 48 ++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 12 deletions(-) mode change 100644 => 100755 scripts/comet/comet2marian.py diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py old mode 100644 new mode 100755 index 9ddbb45c1..2a2ee7777 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -4,30 +4,54 @@ """ import argparse -import yaml +import logging as log import numpy as np +import yaml + +from pathlib import Path + +## Uncomment to see model names supported by your installed version of unbabel-comet +# from comet.models import available_metrics +# supported_comets = [m for m in available_metrics if 'qe' in m.lower()] +supported_comets = ['wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da'] +log.basicConfig(level=log.INFO) parser = argparse.ArgumentParser(description='Convert Unbabel COMET-QE models to Marian weight file.') inputs = parser.add_mutually_exclusive_group(required=True) -inputs.add_argument('--comet', help='Path to COMET model') -inputs.add_argument('--roberta', help='Initialize with Roberta model', action='store_true') -parser.add_argument('--marian', help='Output path for Marian weight file', required=True) +inputs.add_argument('--roberta', '-r', help='Initialize with Roberta model', action='store_true') +inputs.add_argument('--comet', '-c', help=f'COMET model path or an ID: {", ".join(supported_comets)}') +parser.add_argument('--marian', '-m', help='Output path for Marian weight file', required=True) parser.add_argument('-s', '--add_sigmoid', help='Add final sigmoid if not already present', action='store_true') args = parser.parse_args() -if args.roberta: +def load_from_huggingface(model_id): + log.info(f"Loading COMET model from huggingface {model_id}") from transformers import AutoModel + 
try: + model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + except: + log.error(f"Could not resolve {model_id} from huggingface") + raise + return model.eval() + + +if args.roberta: # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large - robertaModel = AutoModel.from_pretrained("microsoft/infoxlm-large", add_pooling_layer=False) - robertaModel.eval() - print(robertaModel) - cometModel = robertaModel + cometModel = load_from_huggingface("microsoft/infoxlm-large") else: - from comet import load_from_checkpoint - cometModel = load_from_checkpoint(args.comet) + from comet import load_from_checkpoint, download_model + model_path = args.comet + if not Path(model_path).exists(): + if model_path not in supported_comets: + log.info(f"Could not find {model_path}") # maybe it's an invalid path + log.info(f"trying to resolve download {model_path}") + model_path = download_model(model_path) + log.info(f"Loading COMET model from checkpoint {model_path}") + cometModel = load_from_checkpoint(model_path) cometModel.eval() - print(cometModel) + +print(cometModel) marianModel = dict() From cd4d1ec49616288eadb87c15df4ec1566e1e3b1e Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 30 Mar 2023 07:17:55 +0000 Subject: [PATCH 226/254] Merged PR 28674: Add --early-stopping-epsilon param The new option `--early-stopping-epsilon` sets minimum required improvement to consider a consecutive validation score not a stalled one. You must set a single value or a separate epsilon for each validation metric. Negative values are allowed. Regression tests: https://github.com/marian-nmt/marian-regression-tests/pull/90 --- src/common/config_parser.cpp | 14 ++++--- src/common/config_validator.cpp | 5 +++ src/training/training.h | 4 +- src/training/validator.cpp | 68 +++++++++++++++++++-------------- src/training/validator.h | 41 ++++++++++++-------- 5 files changed, 81 insertions(+), 51 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index aaeeb514b..16d090897 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -602,14 +602,18 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { "Multiple metrics can be specified", {"cross-entropy"}); cli.add("--valid-reset-stalled", - "Reset stalled validation metrics when the training is restarted"); + "Reset stalled validation metrics when the training is restarted"); cli.add("--valid-reset-all", - "Reset all validation metrics when the training is restarted"); + "Reset all validation metrics when the training is restarted"); cli.add("--early-stopping", - "Stop if the first validation metric does not improve for arg consecutive validation steps", - 10); + "Stop if the first validation metric does not improve for arg consecutive validation steps", + 10); + cli.add>("--early-stopping-epsilon", + "An improvement lower than or equal to arg does not prevent stalled validation. " + "i-th value corresponds to i-th metric in --valid-metrics", + {0}); cli.add("--early-stopping-on", - "Decide if early stopping should take into account first, all, or any validation metrics" + "Decide if early stopping should take into account first, all, or any validation metrics. 
" "Possible values: first, all, any", "first"); diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 6c6b002aa..5563b240d 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -141,6 +141,11 @@ void ConfigValidator::validateOptionsTraining() const { ABORT_IF(supportedStops.find(get("early-stopping-on")) == supportedStops.end(), "Supported options for --early-stopping-on are: first, all, any"); + // check if --early-stopping-epsilon is provided for each validation metric or is a single value + auto epsilons = get>("early-stopping-epsilon"); + ABORT_IF(epsilons.size() > 1 && epsilons.size() != get>("valid-metrics").size(), + "--early-stopping-epsilon must have as many values as there is --valid-metrics or only one"); + // validations for learning rate decaying ABORT_IF(get("lr-decay") > 1.f, "Learning rate decay factor greater than 1.0 is unusual"); diff --git a/src/training/training.h b/src/training/training.h index a5723f308..7f6176879 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -23,10 +23,10 @@ class Train : public ModelTask { void run() override { using namespace data; - + // MPI init should be first thing in training auto mpi = initMPI(/*multiThreaded=*/!options_->get("sync-sgd")); // @TODO: do we need the multiThreaded distinction at all? - + if(mpi) { // if we run MPI, then make sure to sync seed across processes as first action mpi->bCast(&Config::seed, 1, IMPIWrapper::getDataType(&Config::seed)); LOG(info, "Synced seed {}", Config::seed); diff --git a/src/training/validator.cpp b/src/training/validator.cpp index cdc5ef5ac..bd9068acf 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -1,49 +1,60 @@ #include "training/validator.h" #include "embedder/vector_collector.h" +#include + namespace marian { +static std::vector CE_METRICS + = {"cross-entropy", "ce-mean", "ce-sum", "ce-mean-words", "perplexity"}; + std::vector*/>> Validators( std::vector> vocabs, Ptr config) { std::vector*/>> validators; - auto validMetrics = config->get>("valid-metrics"); - - std::vector ceMetrics - = {"cross-entropy", "ce-mean", "ce-sum", "ce-mean-words", "perplexity"}; + auto epsilonsVec = config->get>("early-stopping-epsilon"); + std::deque epsilons(epsilonsVec.begin(), epsilonsVec.end()); + auto eps = epsilons.front(); + epsilons.pop_front(); + auto validMetrics = config->get>("valid-metrics"); for(auto metric : validMetrics) { - if(std::find(ceMetrics.begin(), ceMetrics.end(), metric) != ceMetrics.end()) { + if(std::find(CE_METRICS.begin(), CE_METRICS.end(), metric) != CE_METRICS.end()) { Ptr opts = New(*config); opts->set("cost-type", metric); - auto validator = New(vocabs, opts); + auto validator = New(vocabs, opts, eps); validators.push_back(validator); } else if(metric == "valid-script") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else if(metric == "translation") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else if(metric == "bleu" || metric == "bleu-detok" || metric == "bleu-segmented" || metric == "chrf") { - auto validator = New(vocabs, config, metric); + auto validator = New(vocabs, config, metric, eps); validators.push_back(validator); } else if(metric == "accuracy") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else if(metric == "bert-lm-accuracy") { - 
auto validator = New(vocabs, config, true); + auto validator = New(vocabs, config, true, eps); validators.push_back(validator); } else if(metric == "bert-sentence-accuracy") { - auto validator = New(vocabs, config, false); + auto validator = New(vocabs, config, false, eps); validators.push_back(validator); } else if(metric == "embedding") { - auto validator = New(vocabs, config); + auto validator = New(vocabs, config, eps); validators.push_back(validator); } else { ABORT("Unknown validation metric: {}", metric); } + + if(!epsilons.empty()) { + eps = epsilons.front(); + epsilons.pop_front(); + } } return validators; @@ -63,8 +74,8 @@ void ValidatorBase::actAfterLoaded(TrainingState& state) { } /////////////////////////////////////////////////////////////////////////////////////// -CrossEntropyValidator::CrossEntropyValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options) { +CrossEntropyValidator::CrossEntropyValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, true, epsilon) { createBatchGenerator(/*isTranslating=*/false); auto opts = options_->with("inference", @@ -126,8 +137,8 @@ float CrossEntropyValidator::validateBG(const std::vector>& } /////////////////////////////////////////////////////////////////////////////////////// -AccuracyValidator::AccuracyValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, /*lowerIsBetter=*/false) { +AccuracyValidator::AccuracyValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, /*lowerIsBetter=*/false, epsilon) { createBatchGenerator(/*isTranslating=*/false); // @TODO: remove, only used for saving? @@ -200,8 +211,9 @@ float AccuracyValidator::validateBG(const std::vector>& gra /////////////////////////////////////////////////////////////////////////////////////// BertAccuracyValidator::BertAccuracyValidator(std::vector> vocabs, Ptr options, - bool evalMaskedLM) - : Validator(vocabs, options, /*lowerIsBetter=*/false), evalMaskedLM_(evalMaskedLM) { + bool evalMaskedLM, + float epsilon) + : Validator(vocabs, options, /*lowerIsBetter=*/false, epsilon), evalMaskedLM_(evalMaskedLM) { createBatchGenerator(/*isTranslating=*/false); // @TODO: remove, only used for saving? builder_ = models::createModelFromOptions(options_, models::usage::raw); @@ -295,8 +307,8 @@ float BertAccuracyValidator::validateBG(const std::vector>& } /////////////////////////////////////////////////////////////////////////////////////// -ScriptValidator::ScriptValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, false) { +ScriptValidator::ScriptValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, false, epsilon) { // @TODO: remove, only used for saving? builder_ = models::createModelFromOptions(options_, models::usage::raw); @@ -322,8 +334,8 @@ float ScriptValidator::validate(const std::vector>& graphs, } /////////////////////////////////////////////////////////////////////////////////////// -TranslationValidator::TranslationValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, false), quiet_(options_->get("quiet-translation")) { +TranslationValidator::TranslationValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, false, epsilon), quiet_(options_->get("quiet-translation")) { // @TODO: remove, only used for saving? 
builder_ = models::createModelFromOptions(options_, models::usage::translation); @@ -442,8 +454,8 @@ float TranslationValidator::validate(const std::vector>& gr }; /////////////////////////////////////////////////////////////////////////////////////// -EmbeddingValidator::EmbeddingValidator(std::vector> vocabs, Ptr options) - : Validator(vocabs, options, false), quiet_(options_->get("quiet-translation")) { +EmbeddingValidator::EmbeddingValidator(std::vector> vocabs, Ptr options, float epsilon) + : Validator(vocabs, options, false, epsilon), quiet_(options_->get("quiet-translation")) { // @TODO: remove, only used for saving? builder_ = models::createModelFromOptions(options_, models::usage::embedding); @@ -478,7 +490,7 @@ float EmbeddingValidator::validate(const std::vector>& grap tempFile.reset(new io::TemporaryFile(options_->get("tempdir"), false)); fileName = tempFile->getFileName(); } - + timer::Timer timer; { // @TODO: This can be simplified. If there is no "valid-translation-output", fileName already @@ -551,8 +563,8 @@ float EmbeddingValidator::validate(const std::vector>& grap }; /////////////////////////////////////////////////////////////////////////////////////// -SacreBleuValidator::SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric) - : Validator(vocabs, options, /*lowerIsBetter=*/false), +SacreBleuValidator::SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric, float epsilon) + : Validator(vocabs, options, /*lowerIsBetter=*/false, epsilon), metric_(metric), computeChrF_(metric == "chrf"), useWordIds_(metric == "bleu-segmented"), diff --git a/src/training/validator.h b/src/training/validator.h index d7580a500..aed710778 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -30,16 +30,18 @@ class ValidatorBase : public TrainingObserver { protected: bool lowerIsBetter_{true}; float lastBest_; + float epsilon_{0.f}; size_t stalled_{0}; std::mutex mutex_; ThreadPool threadPool_; public: - ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {} + ValidatorBase(bool lowerIsBetter, float epsilon = 0.f) + : lowerIsBetter_(lowerIsBetter), lastBest_(initScore()), epsilon_(epsilon) {} virtual ~ValidatorBase() {} - virtual float validate(const std::vector>& graphs, - Ptr state) = 0; + virtual float validate(const std::vector>& graphs, Ptr state) = 0; + virtual std::string type() = 0; float& lastBest() { return lastBest_; } @@ -53,8 +55,8 @@ template // @TODO: BuilderType doesn't really class Validator : public ValidatorBase { public: virtual ~Validator() {} - Validator(std::vector> vocabs, Ptr options, bool lowerIsBetter = true) - : ValidatorBase(lowerIsBetter), + Validator(std::vector> vocabs, Ptr options, bool lowerIsBetter = true, float epsilon = 0.f) + : ValidatorBase(lowerIsBetter, epsilon), vocabs_(vocabs), // options_ is a clone of global options, so it can be safely modified within the class options_(New(options->clone())) { @@ -119,13 +121,20 @@ class Validator : public ValidatorBase { void updateStalled(const std::vector>& graphs, float val) { - if((lowerIsBetter_ && lastBest_ > val) - || (!lowerIsBetter_ && lastBest_ < val)) { - stalled_ = 0; + if((lowerIsBetter_ && lastBest_ > val) || (!lowerIsBetter_ && lastBest_ < val)) { + // If epsilon is given, reset the stall count only if the improvement is greater than the epsilon + if(epsilon_ != 0.f && ((lowerIsBetter_ && lastBest_ - val < epsilon_) + || (!lowerIsBetter_ && val - lastBest_ < epsilon_))) { + stalled_++; + } else 
{ + stalled_ = 0; + } lastBest_ = val; if(options_->get("keep-best")) keepBest(graphs); - } else /* if (lastBest_ != val) */ { // (special case 0 at start) @TODO: needed? Seems stall count gets reset each time it does improve. If not needed, remove "if(...)" again. + } else /* if (lastBest_ != val) */ { // (special case 0 at start) + // @TODO: needed? Seems stall count gets reset each time it does improve. + // If not needed, remove "if(...)" again. stalled_++; } } @@ -142,7 +151,7 @@ class CrossEntropyValidator : public Validator> vocabs, Ptr options); + CrossEntropyValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~CrossEntropyValidator() {} std::string type() override { return options_->get("cost-type"); } @@ -154,7 +163,7 @@ class CrossEntropyValidator : public Validator { public: - AccuracyValidator(std::vector> vocabs, Ptr options); + AccuracyValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~AccuracyValidator() {} std::string type() override { return "accuracy"; } @@ -168,7 +177,7 @@ class BertAccuracyValidator : public Validator { bool evalMaskedLM_{true}; public: - BertAccuracyValidator(std::vector> vocabs, Ptr options, bool evalMaskedLM); + BertAccuracyValidator(std::vector> vocabs, Ptr options, bool evalMaskedLM, float epsilon = 0.f); virtual ~BertAccuracyValidator() {} std::string type() override { @@ -185,7 +194,7 @@ class BertAccuracyValidator : public Validator { class ScriptValidator : public Validator { public: - ScriptValidator(std::vector> vocabs, Ptr options); + ScriptValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~ScriptValidator() {} virtual float validate(const std::vector>& graphs, @@ -202,7 +211,7 @@ class ScriptValidator : public Validator { // validator that translates and computes BLEU (or any metric) with an external script class TranslationValidator : public Validator { public: - TranslationValidator(std::vector> vocabs, Ptr options); + TranslationValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~TranslationValidator() {} virtual float validate(const std::vector>& graphs, @@ -223,7 +232,7 @@ class TranslationValidator : public Validator { // @TODO: combine with TranslationValidator (above) to avoid code duplication class SacreBleuValidator : public Validator { public: - SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric); + SacreBleuValidator(std::vector> vocabs, Ptr options, const std::string& metric, float epsilon = 0.f); virtual ~SacreBleuValidator() {} virtual float validate(const std::vector>& graphs, @@ -362,7 +371,7 @@ class SacreBleuValidator : public Validator { // Validator that writes embeddings to a file and computes any metric specified with an external script class EmbeddingValidator : public Validator { public: - EmbeddingValidator(std::vector> vocabs, Ptr options); + EmbeddingValidator(std::vector> vocabs, Ptr options, float epsilon = 0.f); virtual ~EmbeddingValidator() {} virtual float validate(const std::vector>& graphs, From a42147675148c89e909135f6325e5bcdde7a5e8f Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 13 Apr 2023 18:30:45 +0000 Subject: [PATCH 227/254] Merged PR 28502: Comet2Marian: add --spm argument to download vocabulary file Adds --spm argument to download vocabulary file in comet2marian.py conversion script. 
--- scripts/comet/comet2marian.py | 52 +++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 2a2ee7777..8ef4d29fc 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -22,36 +22,64 @@ inputs.add_argument('--comet', '-c', help=f'COMET model path or an ID: {", ".join(supported_comets)}') parser.add_argument('--marian', '-m', help='Output path for Marian weight file', required=True) parser.add_argument('-s', '--add_sigmoid', help='Add final sigmoid if not already present', action='store_true') +parser.add_argument('--spm', '-spm', type=Path, help='Save tokenizer SPM file here', required=False) args = parser.parse_args() def load_from_huggingface(model_id): - log.info(f"Loading COMET model from huggingface {model_id}") - from transformers import AutoModel + log.info(f"Loading transformer model from huggingface {model_id}") + from transformers import AutoModel, AutoTokenizer try: - model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + return model.eval(), getattr(tokenizer, 'vocab_file', None) except: log.error(f"Could not resolve {model_id} from huggingface") raise - return model.eval() -if args.roberta: - # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large - cometModel = load_from_huggingface("microsoft/infoxlm-large") -else: +def load_comet_model(model_path): from comet import load_from_checkpoint, download_model - model_path = args.comet + from transformers import AutoTokenizer + if not Path(model_path).exists(): if model_path not in supported_comets: log.info(f"Could not find {model_path}") # maybe it's an invalid path log.info(f"trying to resolve download {model_path}") model_path = download_model(model_path) log.info(f"Loading COMET model from checkpoint {model_path}") - cometModel = load_from_checkpoint(model_path) - cometModel.eval() + comet_model = load_from_checkpoint(model_path) + comet_model.eval() + + vocab_file = None + try: + pretrained_model = comet_model.hparams.get('pretrained_model') + log.info(f"comet: {model_path}; pretrained: {pretrained_model}") + if pretrained_model: + tokenizer = AutoTokenizer.from_pretrained(pretrained_model) + vocab_file = getattr(tokenizer, 'vocab_file', None) + except Exception as e: + log.warning(f'Error while locating vocab file: {e}') + pass + return comet_model, vocab_file + +if args.roberta: + # Load the model that Unbabel based COMET on: https://huggingface.co/microsoft/infoxlm-large + cometModel, vocab_file = load_from_huggingface("microsoft/infoxlm-large") +else: + cometModel, vocab_file = load_comet_model(args.comet) + +if args.spm: + vocab_file = vocab_file and Path(vocab_file) + if vocab_file and vocab_file.exists(): + if not args.spm.parent.exists(): + raise Exception(f"Directory {args.spm.parent} does not exist") + log.info(f"Copying {vocab_file} to {args.spm}") + args.spm.write_bytes(vocab_file.read_bytes()) + else: + raise Exception(f"Could not locate or save the vocab file: {vocab_file}; please remove --spm argument and try downloading the file manually") -print(cometModel) marianModel = dict() From cd78417721cbab2e95ad86bc94f9d3d976e9c662 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:02:01 
+0100 Subject: [PATCH 228/254] Bump examples from `58f48a0` to `6c40475` (#987) Bumps [examples](https://github.com/marian-nmt/marian-examples) from `58f48a0` to `6c40475`. - [Release notes](https://github.com/marian-nmt/marian-examples/releases) - [Commits](https://github.com/marian-nmt/marian-examples/compare/58f48a06756c623fe799613134810322e061863f...6c40475a9cbdcc219d0b6a8347ae43902204eedc) --- updated-dependencies: - dependency-name: examples dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples b/examples index 58f48a067..6c40475a9 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 58f48a06756c623fe799613134810322e061863f +Subproject commit 6c40475a9cbdcc219d0b6a8347ae43902204eedc From 1334fa51e6ce73c6d66479ed20c77b3cfb78c94a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 14 Apr 2023 21:03:04 +0100 Subject: [PATCH 229/254] Bump regression-tests from `2a8bed3` to `89ce02e` (#984) Bumps [regression-tests](https://github.com/marian-nmt/marian-regression-tests) from `2a8bed3` to `89ce02e`. - [Release notes](https://github.com/marian-nmt/marian-regression-tests/releases) - [Commits](https://github.com/marian-nmt/marian-regression-tests/compare/2a8bed3f0e937a9de2d6fa92dee3bcf482d3d47b...89ce02e3a3e5786d7ae7802108f6a0288f70c269) --- updated-dependencies: - dependency-name: regression-tests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- regression-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-tests b/regression-tests index 2a8bed3f0..89ce02e3a 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 2a8bed3f0e937a9de2d6fa92dee3bcf482d3d47b +Subproject commit 89ce02e3a3e5786d7ae7802108f6a0288f70c269 From 8bf101c43b90b19f922e8b35f20eaff1be514078 Mon Sep 17 00:00:00 2001 From: Nikita Hrytsai Date: Fri, 14 Apr 2023 23:04:21 +0300 Subject: [PATCH 230/254] Fix include path typo in onnx exporter (#978) --- src/onnx/expression_graph_onnx_exporter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/onnx/expression_graph_onnx_exporter.cpp b/src/onnx/expression_graph_onnx_exporter.cpp index d27f1360c..8e6625a42 100644 --- a/src/onnx/expression_graph_onnx_exporter.cpp +++ b/src/onnx/expression_graph_onnx_exporter.cpp @@ -5,7 +5,7 @@ #include "models/model_factory.h" #include "models/encoder_decoder.h" #include "data/corpus_base.h" -#include "tensors/cpu/fbgemm/expression_graph_packable.h" +#include "tensors/cpu/expression_graph_packable.h" #include From 3daf4ee2906583dfc86e4f6986b40f843e7e3a3c Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Sat, 15 Apr 2023 00:25:50 -0700 Subject: [PATCH 231/254] quote CPUINFO in cmake (#983) Without these quotes, cmake fails in a confusing manner on systems whose cpuinfo output includes spaces. This arose in the context of attempting to compile natively on an m1 mac. $ /usr/sbin/sysctl -n machdep.cpu.features machdep.cpu.leaf7_features sysctl: unknown oid 'machdep.cpu.leaf7_features' Obviously, this didn't work out well; there is still much more to do. Still, the quotes are cheap and eliminate a confusing failure mode. For this reason, I added them to the linux as well as the darwin path. 
--- cmake/FindSSE.cmake | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cmake/FindSSE.cmake b/cmake/FindSSE.cmake index e1c58fbc9..0f1483487 100644 --- a/cmake/FindSSE.cmake +++ b/cmake/FindSSE.cmake @@ -4,7 +4,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) - STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") @@ -13,14 +13,14 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ENDIF (SSE2_TRUE) # /proc/cpuinfo apparently omits sse3 :( - STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE) IF (NOT SSE3_TRUE) - STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE) ENDIF (NOT SSE3_TRUE) - STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE) IF (SSE3_TRUE OR SSSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") @@ -33,7 +33,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) - STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") @@ -41,7 +41,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) - STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE) IF (SSE42_TRUE) set(SSE4_2_FOUND true CACHE BOOL "SSE4.2 available on host") @@ -49,7 +49,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host") ENDIF (SSE42_TRUE) - STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx" "${SSE_THERE}" AVX_TRUE) IF (AVX_TRUE) set(AVX_FOUND true CACHE BOOL "AVX available on host") @@ -57,7 +57,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(AVX_FOUND false CACHE BOOL "AVX available on host") ENDIF (AVX_TRUE) - STRING(REGEX REPLACE "^.*(avx2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx2" "${SSE_THERE}" AVX2_TRUE) IF (AVX2_TRUE) set(AVX2_FOUND true CACHE BOOL "AVX2 available on host") @@ -65,7 +65,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(AVX2_FOUND false CACHE BOOL "AVX2 available on host") ENDIF (AVX2_TRUE) - STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE) IF (AVX512_TRUE) set(AVX512_FOUND true CACHE BOOL "AVX512 available on host") @@ -76,7 +76,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") 
EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features machdep.cpu.leaf7_features" OUTPUT_VARIABLE CPUINFO) - STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") @@ -84,7 +84,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") ENDIF (SSE2_TRUE) - STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE) IF (SSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") @@ -100,7 +100,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) - STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") @@ -108,7 +108,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) - STRING(REGEX REPLACE "^.*(AVX).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(AVX).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "AVX" "${SSE_THERE}" AVX_TRUE) IF (AVX_TRUE) set(AVX_FOUND true CACHE BOOL "AVX available on host") @@ -116,7 +116,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(AVX_FOUND false CACHE BOOL "AVX available on host") ENDIF (AVX_TRUE) - STRING(REGEX REPLACE "^.*(AVX2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(AVX2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "AVX2" "${SSE_THERE}" AVX2_TRUE) IF (AVX2_TRUE) set(AVX2_FOUND true CACHE BOOL "AVX2 available on host") @@ -124,7 +124,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(AVX2_FOUND false CACHE BOOL "AVX2 available on host") ENDIF (AVX2_TRUE) - STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE) IF (AVX512_TRUE) set(AVX512_FOUND true CACHE BOOL "AVX512 available on host") From d054dc844f7e748ca04c04deead4f8b58057a217 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Jun 2023 10:53:58 +0100 Subject: [PATCH 232/254] Bump src/3rd_party/fbgemm from `6f45243` to `0e33146` (#995) Bumps [src/3rd_party/fbgemm](https://github.com/marian-nmt/FBGEMM) from `6f45243` to `0e33146`. - [Commits](https://github.com/marian-nmt/FBGEMM/compare/6f45243cb8ab7d7ab921af18d313ae97144618b8...0e33146d3e7f070c7de9494efef49147a9d20558) --- updated-dependencies: - dependency-name: src/3rd_party/fbgemm dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/3rd_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/fbgemm b/src/3rd_party/fbgemm index 6f45243cb..0e33146d3 160000 --- a/src/3rd_party/fbgemm +++ b/src/3rd_party/fbgemm @@ -1 +1 @@ -Subproject commit 6f45243cb8ab7d7ab921af18d313ae97144618b8 +Subproject commit 0e33146d3e7f070c7de9494efef49147a9d20558 From 02678ef37a8f4f35fc30c4b21cdd0e31fdd5442e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 19 Jun 2023 15:51:25 +0000 Subject: [PATCH 233/254] Merged PR 29868: Add option to replace current parameters with smoothed version during training Adds option to replace current parameters with smoothed version during training. Could potentially help with convergence and training stability. --- src/common/config_parser.cpp | 3 +++ src/optimizers/optimizers.cpp | 22 +++++++++++++++++++++- src/optimizers/optimizers.h | 4 ++++ src/training/graph_group.cpp | 18 ++++++++++++++++++ src/training/graph_group.h | 6 ++++++ src/training/graph_group_sync.cpp | 6 +++++- src/training/scheduler.h | 7 +++++++ 7 files changed, 64 insertions(+), 2 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 16d090897..e24709f6a 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -517,6 +517,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable. " "Auto-adjusted to --mini-batch-words-ref if given.", 0.f)->implicit_val("1e-4"); + cli.add("--exponential-smoothing-replace-freq", + "When exponential-smoothing is enabled replace master parameters with smoothed parameters once every n steps (possible units u=updates, t=target labels, e=epochs)", + "0"); cli.add("--guided-alignment", "Path to a file with word alignments. Use guided alignment to guide attention or 'none'. " "If --tsv it specifies the index of a TSV field that contains the alignments (0-based)", diff --git a/src/optimizers/optimizers.cpp b/src/optimizers/optimizers.cpp index f54276e18..d53e46eef 100644 --- a/src/optimizers/optimizers.cpp +++ b/src/optimizers/optimizers.cpp @@ -109,7 +109,7 @@ void OptimizerBase::swapWithSmoothed(Tensor params) { if(castOptimizerType_) { // If true then optimizer type is different from the graph type, // hence a parameter master copy exists and we swap with the master copy. - // We then from optimizer parameter type to graph parameter type + // We then copy and cast from optimizer parameter type to graph parameter type pm_->swap(avg_); CopyCast(params, pm_); } else { @@ -121,6 +121,26 @@ void OptimizerBase::swapWithSmoothed(Tensor params) { } } +void OptimizerBase::replaceWithSmoothed(Tensor params) { + if(!mvAvg_) // no smoothing, don't do anything + return; + + // This function will overwrite the original parameters which are then lost. + if(castOptimizerType_) { + // If true then optimizer type is different from the graph type, + // hence a parameter master copy exists and we copy to the master copy. + // We then copy and cast from optimizer parameter type to graph parameter type + pm_->copyFrom(avg_); + CopyCast(params, pm_); + } else { + // Types are equal hence there is no parameter master copy. This means + // we need to do a proper copy from the graph params to the smoothed + // version. 
+ params->copyFrom(avg_); + } +} + + void OptimizerBase::load(std::vector& items, const std::vector>& opts, const std::vector>& backends, diff --git a/src/optimizers/optimizers.h b/src/optimizers/optimizers.h index e7e8c8ed1..2c7128c51 100644 --- a/src/optimizers/optimizers.h +++ b/src/optimizers/optimizers.h @@ -114,6 +114,10 @@ class OptimizerBase : public TrainingObserver, public ExponentialSmoothing { // This function swaps out the current optimizer parameters with the smoothed version (provided smoothing is enabled). // Usually we will call this twice, to swap in and to swap out. void swapWithSmoothed(Tensor params); + + // This function replaces the current optimizer parameters with the smoothed version (provided smoothing is enabled). + // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. + void replaceWithSmoothed(Tensor params); // return stateful optimizer shards, for base that's only averaged parameters virtual std::vector getShards() { diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index c160332e4..367e47e16 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -526,6 +526,24 @@ void GraphGroup::swapWithSmoothed() { barrier(); } +void GraphGroup::replaceWithSmoothed() { + if(isMainProcess()) + LOG(info, "Replacing master parameters with smoothed parameters"); + + auto replace = [&](size_t i, size_t begin, size_t end) { + auto curParam = graphs_[i]->params()->vals()->subtensor(begin, end-begin); + optimizerShards_[i]->replaceWithSmoothed(curParam); + return true; // dummy success + }; + comm_->foreach(replace); + comm_->allGatherParams(); + + if(shardingMode_ == ShardingMode::local) + comm_->broadcastParams(); + + barrier(); +} + void GraphGroup::validate() { //@TODO: rename this function to something less confusing. ABORT_IF(finalized_, "Training has already finished."); } diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 0895caa77..d7525a102 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -114,8 +114,14 @@ class GraphGroup { const OptimizerBase::GatherStateFunc& gatherFn); public: + // This function swaps out the current optimizer parameters with the smoothed version (provided smoothing is enabled). + // Usually we will call this twice, to swap in and to swap out. void swapWithSmoothed(); + // This function replaces the current optimizer parameters with the smoothed version (provided smoothing is enabled). + // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. 
+ void replaceWithSmoothed(); + bool isMainProcess() const { return mpi_->isMainProcess(); } // (we need this test a few times) void barrier() const { mpi_->barrier(); } // (we need this several times) diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp index a3eee8a7b..b97845814 100644 --- a/src/training/graph_group_sync.cpp +++ b/src/training/graph_group_sync.cpp @@ -348,7 +348,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num if(scheduler_->syncing()) { syncParametersAndShards(); } - + // save intermediate model (and optimizer state) to file if(scheduler_->saving()) { save(); @@ -361,6 +361,10 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num scheduler_->validate(graphs_); swapWithSmoothed(); } + + if(scheduler_->replacingWithSmoothed()) { + replaceWithSmoothed(); + } } if(saneGradient) diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 30f8c8de7..b6ac1df79 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -286,6 +286,13 @@ class Scheduler : public TrainingObserver { return state_->enteredNewPeriodOf(options_->get("sync-freq", "0")); } + bool replacingWithSmoothed() { + if(options_->get("exponential-smoothing", 0.f) != 0.f) + return state_->enteredNewPeriodOf(options_->get("exponential-smoothing-replace-freq", "0")); + else + return false; + } + void validate(const std::vector>& graphs, bool isFinal = false) { // Do not validate if already validated (for instance, after the model is loaded) From 7425c0261c56c1dab4a026b6c08a134a063fcaaf Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 27 Jun 2023 19:56:58 +0000 Subject: [PATCH 234/254] Merged PR 30009: Divergence detection and fallback to fp32 if training with fp16 fails This PR adds a do-while loop to training. It should only repeat if a fp16 training run was interrupted via the throwing of a DivergenceException from training/scheduler.h and if --throw-on-divergence and --fp16-fallback-to-fp32 are enabled. The repeated training run will continue from last checkpoint (similar to a manually interrupted training) but attempt training in fp32. If that training run or any other fp32 training happens to diverge, training will exit with an unhandled DivergenceException. This is on purpose to indicate a fatal error. --- CHANGELOG.md | 2 + VERSION | 2 +- src/common/config_parser.cpp | 6 ++ src/training/scheduler.h | 162 ++++++++++++++++++++++------- src/training/training.h | 190 +++++++++++++++++++++------------- src/training/training_state.h | 14 +++ 6 files changed, 264 insertions(+), 112 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6aff5037f..a2a9a9bdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) + diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. - Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian. - Validator that generates embeddings and can be used during COMET training with an external script. - New experimental layer framework for Transformer-like models. 
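
Note (not part of the patch): to make the detection rule from the commit message concrete, the scheduler keeps a slow and a fast exponential moving average of the per-update loss, plus a running variance of the slow one, and flags divergence when the fast average exceeds the slow average by more than the configured number of standard deviations. Below is a minimal stand-alone restatement of that rule, sketched in Python for brevity; the class and variable names are illustrative only, and the real implementation is the C++ `Scheduler::update()` change further down, which additionally handles MPI aggregation, logging, and persisted training state.

```python
import math

class DivergenceDetector:
    """Illustrative restatement of the --throw-on-divergence rule; not Marian code."""
    def __init__(self, window_slow=100, window_fast=10, tolerance=3.0):
        self.window_slow, self.window_fast, self.tolerance = window_slow, window_fast, tolerance
        self.updates = 0
        self.avg_slow = self.avg_fast = self.var_slow = 0.0

    def update(self, loss: float) -> None:
        self.updates += 1
        if self.updates == 1:  # initialize statistics on the first observed loss
            self.avg_slow = self.avg_fast = loss
            self.var_slow = 0.0

        # Only test once the slow window has seen enough updates to be meaningful.
        if self.updates > self.window_slow:
            delta = self.avg_fast - self.avg_slow
            sigma = math.sqrt(self.var_slow)
            if delta > 0 and sigma > 0 and delta / sigma > self.tolerance:
                raise RuntimeError(
                    f"suspected divergence: fast avg {self.avg_fast:.4f} exceeds "
                    f"slow avg {self.avg_slow:.4f} by {delta / sigma:.2f} sigmas")

        # Exponential moving averages: ~90% of the mass comes from the last `window` updates.
        alpha_slow = 2.0 / (min(self.window_slow, self.updates) + 1)
        alpha_fast = 2.0 / (min(self.window_fast, self.updates) + 1)
        d_slow = loss - self.avg_slow
        self.avg_slow += alpha_slow * d_slow
        self.var_slow = (1.0 - alpha_slow) * (self.var_slow + alpha_slow * d_slow * d_slow)
        self.avg_fast += alpha_fast * (loss - self.avg_fast)
```

When the resulting exception reaches the do-while loop added to training.h later in this patch, an fp16 run with `--fp16-fallback-to-fp32` restarts from the last checkpoint in fp32; otherwise the exception is deliberately left unhandled so training exits with an error.
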
diff --git a/VERSION b/VERSION index 00f862625..21decde5d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.3 +v1.12.4 diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index e24709f6a..d70048fe9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -559,6 +559,12 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") ->implicit_val("8.f 10000 1.f 8.f"); + cli.add>("--throw-on-divergence", + "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " + "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") + ->implicit_val("100 10 3.0f"); + cli.add("--fp16-fallback-to-fp32", + "If fp16 training diverges and throws try to continue training with fp32 precision"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", diff --git a/src/training/scheduler.h b/src/training/scheduler.h index b6ac1df79..9c84d1593 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -9,6 +9,18 @@ namespace marian { +/** + * This exception gets thrown when a training run divergence was detected. See below in main update function. +*/ +class DivergenceException : public std::runtime_error { +public: + DivergenceException(float averageSlow, float averageFast, float sigmas) + : std::runtime_error(fmt::format( + "Detected training divergence: slow-moving average loss {:.4f} exceeded by fast-moving average loss {:.4f} by {:.4f} = {:.4f} * sigmas", + averageSlow, averageFast, averageFast - averageSlow, sigmas)) + {} +}; + class Scheduler : public TrainingObserver { private: Ptr options_; @@ -17,6 +29,12 @@ class Scheduler : public TrainingObserver { Ptr mpi_; bool first_{true}; // true if this is the first update after renewing the training + + bool throwOnDivergence_{false}; // throw an exception if training divergence is detected + size_t lossAvgWindowSlow_{100}; // window size for slow-moving average loss for divergence detection + size_t lossAvgWindowFast_{10}; // window size for fast-moving average loss for divergence detection + float divergenceTolerance_{3.f}; // tolerance for divergence detection as multiples of standard deviation + size_t gradientNormAvgWindow_{100}; // window size for recording the exponential average of gradient norms, after this many updates about 90% of the mass comes from this many last updates SchedulingParameter logicalEpoch_; size_t logicalEpochWidth_{0}; @@ -134,6 +152,21 @@ class Scheduler : public TrainingObserver { : options_(options), state_(state), mpi_(mpi), gradientNormAvgWindow_(options_->get("gradient-norm-average-window", 100)) { + auto throwParameters = options_->get>("throw-on-divergence"); + if(!throwParameters.empty()) { + throwOnDivergence_ = true; + if(throwParameters.size() > 0) + lossAvgWindowSlow_ = std::stoul(throwParameters[0]); + if(throwParameters.size() > 1) + lossAvgWindowFast_ = std::stoul(throwParameters[1]); + if(throwParameters.size() > 2) + divergenceTolerance_ = std::stof(throwParameters[2]); + LOG(info, + "[scheduler] Divergence detection is enabled for slow-moving averaging window over {} steps " + "vs fast-moving window over {} steps with tolerance of {} sigmas", + 
lossAvgWindowSlow_, lossAvgWindowFast_, divergenceTolerance_); + } + // parse logical-epoch parameters auto logicalEpochStr = options->get>("logical-epoch", {"1e", "0"}); ABORT_IF(logicalEpochStr.empty(), "Logical epoch information is missing?"); @@ -405,27 +438,84 @@ class Scheduler : public TrainingObserver { // -freq parameters do not support epoch units state_->validated = false; - // Since batchLabels is counted across all MPI processes, we also should temporarily - // extrapolate cost across MPI processes, to have numbers in the right range. - // When doing the actual log, we then aggregate across MPI processes to get the accurate number. + // collect costs from all nodes if training with MPI if(mpi_) { - rationalLoss.loss *= mpi_->numMPIProcesses(); - rationalLoss.count *= mpi_->numMPIProcesses(); + mpi_->allReduce(&rationalLoss.loss, &rationalLoss.loss, 1, MPI_FLOAT, MPI_SUM); + mpi_->allReduce(&rationalLoss.count, &rationalLoss.count, 1, MPI_FLOAT, MPI_SUM); } + float currentNormalizedLoss = rationalLoss.loss / rationalLoss.count; - // @BUGBUG: rationalLoss.count is float, not a count. Possible solution: make (costSum, costCount) a StaticLoss object as well - state_->costSum += rationalLoss.loss; // aggregate sum cost since last display - state_->costCount += rationalLoss.count; // cost gets normalized w.r.t. this in display + state_->costSum += rationalLoss.loss; + state_->costCount += rationalLoss.count; state_->updatesDisp += 1; state_->samplesDisp += batchSize; state_->wordsDisp += batchLabels; // words at given input processed since last display, for speed display state_->samplesEpoch += batchSize; // sentences processed in this epoch - state_->labelsTotal += batchLabels; // total labels processed + state_->labelsTotal += batchLabels; // total labels processed state_->newUpdate(numReadBatches); + // true if --throw-on-divergence [lossAvgWindowSlow_] [lossAvgWindowFast_] [divergenceTolerance_] is enabled, false otherwise + if(throwOnDivergence_) { + size_t windowSlow = std::min(lossAvgWindowSlow_, state_->batches); // we compare the running exponential average over a longer window + size_t windowFast = std::min(lossAvgWindowFast_, state_->batches); // with the running exponential everage over a shorter window (for smoothing) + + // By default we set windowSlow = 100 and windowFast = 10, so if values diverge the average from the shorter window should pick this up quickly + // vs the longer window while still smoothing over multiple updates avoiding detecting random single spikes as divergence. + float alphaSlow = 2.f / (float)(windowSlow + 1); // about 90% of the mass will come from the windowSlow last steps + float alphaFast = 2.f / (float)(windowFast + 1); // about 90% of the mass will come from the windowFast last steps + + // set some reasonable defaults during training start. 
Cost shouldn't be zero unless fresh start without *.progress.yml + if(state_->lossAvgSlow == 0) { + state_->lossAvgSlow = currentNormalizedLoss; + state_->lossAvgFast = currentNormalizedLoss; + state_->lossVarSlow = 0; + } + + // allow statistics to see at least lossAvgWindowSlow_ updates before using for divergence detection + if(state_->batches > lossAvgWindowSlow_) { + // we compare the faster moving average against the slower moving exponential loss average + float delta = state_->lossAvgFast - state_->lossAvgSlow; + // running standard deviation + float sigma = std::sqrt(state_->lossVarSlow); + + // negative delta is always safe (indicates convergence) and sigma should always be larger than zero (safe for division) after a few first steps + if(delta > 0 && sigma > 0) { + // how many standard deviations (sigmas) above slow-moving average? + float sigmasDiverged = delta / sigma; + if(sigmasDiverged > divergenceTolerance_) { // uh-oh - by default assume training diverged if slow-moving average is exceeded by e.g. 3 sigmas + LOG(warn, + "Detected training divergence: slow-moving average loss {:.4f} exceeded by fast-moving average loss {:.4f} by {:.4f} = {:.4f} * sigmas", + state_->lossAvgSlow, state_->lossAvgFast, delta, sigmasDiverged); + + // this gets propagated to the main training loop in training/training.h and will either fail the whole training process with + // an unhandled exception (thus exiting with error code) or trigger another training run with fallback to fp32 if we were + // training with fp16 and --fp16-fallback-to-fp32 is enabled. + throw DivergenceException(state_->lossAvgSlow, state_->lossAvgFast, sigmasDiverged); + } + } + + if(state_->enteredNewPeriodOf(options_->get("disp-freq")) || state_->batches <= options_->get("disp-first")) { + if(!mpi_ || mpi_->isMainProcess()) { + LOG(debug, + "delta(={:.4f}) = avgFast(={:.4f}) - avgSlow(={:.4f}) = {:.4f} * sigma(={:.4f}) < {:.4f} * sigma", + delta, state_->lossAvgFast, state_->lossAvgSlow, delta / sigma, sigma, divergenceTolerance_); + } + } + } + + // log slow-moving exponential average and variance of training cost stats + float deltaSlow = currentNormalizedLoss - state_->lossAvgSlow; + state_->lossAvgSlow = state_->lossAvgSlow + alphaSlow * deltaSlow; + state_->lossVarSlow = (1.0f - alphaSlow) * (state_->lossVarSlow + alphaSlow * deltaSlow * deltaSlow); + + // log fast-moving exponential average of training cost stats + float deltaFast = currentNormalizedLoss - state_->lossAvgFast; + state_->lossAvgFast = state_->lossAvgFast + alphaFast * deltaFast; + } + if(gradientNorm) { size_t range = std::min(gradientNormAvgWindow_, state_->batches); float alpha = 2.f / (float)(range + 1); @@ -445,38 +535,30 @@ class Scheduler : public TrainingObserver { if(state_->enteredNewPeriodOf(options_->get("disp-freq")) || state_->batches <= options_->get("disp-first")) { // if MPI then aggregate precise cost across workers - if(mpi_) { - state_->costSum /= mpi_->numMPIProcesses(); // undo the extra scaling - state_->costCount /= mpi_->numMPIProcesses(); // undo the extra scaling - mpi_->allReduce(&state_->costSum, &state_->costSum, 1, MPI_FLOAT, MPI_SUM); - mpi_->allReduce(&state_->costCount, &state_->costCount, 1, MPI_FLOAT, MPI_SUM); - } - - if(mpi_ && mpi_->myMPIRank() != 0) { - // skip the report on alternate worker processes - } else if(options_->get("lr-report")) { - LOG(info, - "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f} : L.r. 
{:.4e}", - formatLogicalEpoch(), - state_->batches, - utils::withCommas(state_->samplesEpoch), - formatLoss(lossType, dispLabelCounts, batchLabels, state_), - timer_.elapsed(), - state_->wordsDisp / timer_.elapsed(), - state_->gradientNormAvg, - state_->eta); - } else { - LOG(info, - "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f}", - formatLogicalEpoch(), - state_->batches, - utils::withCommas(state_->samplesEpoch), - formatLoss(lossType, dispLabelCounts, batchLabels, state_), - timer_.elapsed(), - state_->wordsDisp / timer_.elapsed(), - state_->gradientNormAvg); + if(!mpi_ || mpi_->isMainProcess()) { + if(options_->get("lr-report")) { + LOG(info, + "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f} : L.r. {:.4e}", + formatLogicalEpoch(), + state_->batches, + utils::withCommas(state_->samplesEpoch), + formatLoss(lossType, dispLabelCounts, batchLabels, state_), + timer_.elapsed(), + state_->wordsDisp / timer_.elapsed(), + state_->gradientNormAvg, + state_->eta); + } else { + LOG(info, + "Ep. {} : Up. {} : Sen. {} : {} : Time {:.2f}s : {:.2f} words/s : gNorm {:.4f}", + formatLogicalEpoch(), + state_->batches, + utils::withCommas(state_->samplesEpoch), + formatLoss(lossType, dispLabelCounts, batchLabels, state_), + timer_.elapsed(), + state_->wordsDisp / timer_.elapsed(), + state_->gradientNormAvg); + } } - timer_.start(); state_->costSum = 0; state_->costCount = 0; diff --git a/src/training/training.h b/src/training/training.h index 7f6176879..cbca3eff2 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -45,78 +45,126 @@ class Train : public ModelTask { dataset->prepare(); - Ptr stats; - if(options_->get("mini-batch-fit")) { - LOG(info, - "[batching] Collecting statistics for batch fitting with step size {}", - options_->get("mini-batch-fit-step")); - // @TODO this should receive a function object that can generate a fake batch; - // that way vocabs would not be exposed. - auto model = New(options_, mpi); - - // use temporary scheduler to make sure everything gets destroyed properly - // otherwise the scheduler believes that registered objects still exist - auto tempTrainState = New(options_->get("learn-rate")); - auto tempScheduler = New(options_, tempTrainState, mpi); - - model->setScheduler(tempScheduler); // collectStats() needs to know about dynamic MB scaling - stats = model->collectStats(dataset->getVocabs()); - LOG(info, "[batching] Done. Typical MB size is {} target words", utils::withCommas(stats->estimateTypicalTrgWords())); - } - - auto trainState = New(options_->get("learn-rate")); - auto scheduler = New(options_, trainState, mpi); - - if((options_->hasAndNotEmpty("valid-sets") || options_->hasAndNotEmpty("valid-script-path")) - && SchedulingParameter::parse(options_->get("valid-freq"))) { - for(auto validator : Validators(dataset->getVocabs(), options_)) - scheduler->addValidator(validator); - } - - auto batchGenerator = New(dataset, options_, stats); - - scheduler->registerTrainingObserver(batchGenerator); - - auto model = New(options_, mpi); - model->setScheduler(scheduler); - model->setTypicalTrgBatchWords(batchGenerator->estimateTypicalTrgBatchWords()); // needed for dynamic MB scaling - model->load(); - - bool restored = !options_->get("no-restore-corpus") - && batchGenerator->restore(trainState); - - // We only want custom behavior once training starts. 
- installCustomSignalHandlers(); - - // -- main training loop - scheduler->started(); - while(scheduler->keepGoing()) { - if(!restored) - batchGenerator->prepare(); - restored = false; - - // main training loop for one epoch - for(auto batch : *batchGenerator) { - if (!scheduler->keepGoing()) - break; - model->update(batch); + // We run training in a do-while loop. It should only restart if a fp16 training run was interrupted + // via the throwing of a DivergenceException from training/scheduler.h and if --throw-on-divergence and + // --fp16-fallback-to-fp32 are enabled. + // The repeated training run will continue from last checkpoint (similar to a manually interrupted training) + // but attempt training in fp32. If that training run or any other fp32 training happens to diverge, + // training will exit with an unhandled DivergenceException. This is on purpose to indicate a fatal error. + bool restartTraining; + do { + try { + // there will be only one training loop execution unless in special situations, + // for example, when fp16 training diverges and it is restarted with fp32 + restartTraining = false; + + Ptr stats; + if(options_->get("mini-batch-fit")) { + LOG(info, + "[batching] Collecting statistics for batch fitting with step size {}", + options_->get("mini-batch-fit-step")); + // @TODO this should receive a function object that can generate a fake batch; + // that way vocabs would not be exposed. + auto model = New(options_, mpi); + + // use temporary scheduler to make sure everything gets destroyed properly + // otherwise the scheduler believes that registered objects still exist + auto tempTrainState = New(options_->get("learn-rate")); + auto tempScheduler = New(options_, tempTrainState, mpi); + + model->setScheduler(tempScheduler); // collectStats() needs to know about dynamic MB scaling + stats = model->collectStats(dataset->getVocabs()); + LOG(info, "[batching] Done. Typical MB size is {} target words", utils::withCommas(stats->estimateTypicalTrgWords())); + } + + auto trainState = New(options_->get("learn-rate")); + auto scheduler = New(options_, trainState, mpi); + + if((options_->hasAndNotEmpty("valid-sets") || options_->hasAndNotEmpty("valid-script-path")) + && SchedulingParameter::parse(options_->get("valid-freq"))) { + for(auto validator : Validators(dataset->getVocabs(), options_)) + scheduler->addValidator(validator); + } + + auto batchGenerator = New(dataset, options_, stats); + + scheduler->registerTrainingObserver(batchGenerator); + + auto model = New(options_, mpi); + model->setScheduler(scheduler); + model->setTypicalTrgBatchWords(batchGenerator->estimateTypicalTrgBatchWords()); // needed for dynamic MB scaling + model->load(); + + bool restored = !options_->get("no-restore-corpus") + && batchGenerator->restore(trainState); + + // We only want custom behavior once training starts. 
+ installCustomSignalHandlers(); + + // -- main training loop + scheduler->started(); + while(scheduler->keepGoing()) { + if(!restored) + batchGenerator->prepare(); + restored = false; + + // main training loop for one epoch + for(auto batch : *batchGenerator) { + if (!scheduler->keepGoing()) + break; + model->update(batch); + } + + if(scheduler->keepGoing()) + scheduler->increaseEpoch(); + } + scheduler->finished(); + + model->finalize(); // allow async to sync before final save --@TODO: rename, or move into save() + + // Avoid saving the model twice if it has been loaded and training did not progress + if(!trainState->loaded) + model->save(true); + + // Signal success to a potential MPI runner + model = nullptr; // release any reference to MPI that model may hold + scheduler = nullptr; // as above + finalizeMPI(std::move(mpi)); + + } catch(DivergenceException& e) { // handling divergent training if scheduler is configured + // to throw via --throw-on-divergence + if(options_->get("fp16-fallback-to-fp32", false)) { + auto precisions = options_->get>("precision"); + Type parameterType = typeFromString(precisions[0]); + if(parameterType == Type::float16) { + // we diverged, but we were apparently training with fp16 and fallback to fp32 + // is enabled. There is a chance we can rescue the training run by restarting + // from the last checkpoint but using fp32 precision training. + LOG(warn, "Training diverged, but --fp16-fallback-to-fp32 is enabled. " + "Attempting restart from the last checkpoint with fp32 precision."); + + // undo all options that would be set for fp16 training + options_ = options_->with( + "fp16", false, + "precision", std::vector({"float32", "float32"}), + "cost-scaling", std::vector({}) + ); + + // this gets checked at final do-while condition + restartTraining = true; + } else { + // We diverged and fallback is enabled, but we are already training with fp32, + // hence rethrow and let training die with error. + LOG(warn, "Training diverged, rethrowing divergence exception"); + throw e; + } + } else { + // We diverged and no fallback enabled, hence rethrow and let training die with error. 
+ LOG(warn, "Training diverged, rethrowing divergence exception"); + throw e; + } } - - if(scheduler->keepGoing()) - scheduler->increaseEpoch(); - } - scheduler->finished(); - - model->finalize(); // allow async to sync before final save --@TODO: rename, or move into save() - - // Avoid saving the model twice if it has been loaded and training did not progress - if(!trainState->loaded) - model->save(true); - - // Signal success to a potential MPI runner - model = nullptr; // release any reference to MPI that model may hold - scheduler = nullptr; // as above - finalizeMPI(std::move(mpi)); + } while(restartTraining); } }; diff --git a/src/training/training_state.h b/src/training/training_state.h index 2fb9209fa..800dd60c7 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -73,6 +73,12 @@ class TrainingState { // Number of updates seen since last display size_t updatesDisp{0}; + // Running average of training cost per label + float lossAvgSlow{0}; + float lossAvgFast{0}; + // Running variance of training cost per label + float lossVarSlow{0}; + // Running average of gradient norm float gradientNormAvg{0}; // Running variance of gradient norm @@ -230,6 +236,10 @@ class TrainingState { samplesDisp = config["disp-samples"].as(); updatesDisp = config["disp-updates"].as(); + lossAvgSlow = config["loss-avg-slow"].as(); + lossAvgFast = config["loss-avg-fast"].as(); + lossVarSlow = config["loss-var-slow"].as(); + gradientNormAvg = config["gradient-norm-avg"].as(); gradientNormVar = config["gradient-norm-var"].as(); @@ -277,6 +287,10 @@ class TrainingState { config["disp-samples"] = samplesDisp; config["disp-words"] = wordsDisp; + config["loss-avg-slow"] = lossAvgSlow; + config["loss-avg-fast"] = lossAvgFast; + config["loss-var-slow"] = lossVarSlow; + config["gradient-norm-avg"] = gradientNormAvg; config["gradient-norm-var"] = gradientNormVar; From ea8a2db445310ead64df7c3ffbf401819307c0f6 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Wed, 28 Jun 2023 15:55:03 +0000 Subject: [PATCH 235/254] Merged PR 30038: Add a comment that automatic builds are disabled --- azure-pipelines.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3b1bfff3f..0f19a0f8d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,8 +13,11 @@ parameters: type: boolean default: true -# The pipeline CI trigger is set on the branch master only and PR trigger on a -# (non-draft) pull request to any branch +# Warning: the current branch policies disable the automatic triggering to +# minimize VM usage! +# The configuration below specifies that the pipeline CI trigger is set on the +# branch master only and a PR trigger is on a (non-draft) pull request to any +# branch. trigger: # This minimizes the number of parallel pipeline runs. When a pipeline is # running, the CI waits until it is completed before starting another one. From 0fa11f5cb4461857ea34f08a85168cdd683bb86f Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 28 Jun 2023 16:07:02 +0000 Subject: [PATCH 236/254] Merged PR 30034: Automatically create marian-YYYY-MM-DD-GIT_REV.tgz Small simplification to create the correctly named tarball via `make marian_tgz` resulting in e.g. `marian-2023-06-28-8390b1d.tgz` This will be executed every time make `marian_tgz` is invoked, but depends on the correct targets and will update changed commit revisions etc. Uses PST time zone. 
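As a rough illustration of the naming scheme only (not part of the build): the tarball name is just the America/Los_Angeles date plus the 7-character git revision. A minimal Python sketch, assuming `git` is on PATH and the working directory is the repository:

```python
# Sketch only: mirrors the marian-YYYY-MM-DD-GIT_REV.tgz naming used by cmake/Tarball.cmake.
import subprocess
from datetime import datetime
from zoneinfo import ZoneInfo  # Python 3.9+

date = datetime.now(ZoneInfo("America/Los_Angeles")).strftime("%Y-%m-%d")
rev = subprocess.run(["git", "rev-parse", "--short=7", "HEAD"],
                     capture_output=True, text=True, check=True).stdout.strip()
print(f"marian-{date}-{rev}.tgz")  # e.g. marian-2023-06-28-8390b1d.tgz
```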
--- cmake/Tarball.cmake | 30 ++++++++++++++++++++++++++++++ src/CMakeLists.txt | 29 +++-------------------------- 2 files changed, 33 insertions(+), 26 deletions(-) create mode 100644 cmake/Tarball.cmake diff --git a/cmake/Tarball.cmake b/cmake/Tarball.cmake new file mode 100644 index 000000000..8611f5553 --- /dev/null +++ b/cmake/Tarball.cmake @@ -0,0 +1,30 @@ +# marian-YYYY-MM-DD-revision.tgz +# This combines marian, marian_decoder in a single TAR file for +# execution in MSFT internal tools FLO and Singularity. + +execute_process( + COMMAND bash -c "TZ=America/Los_Angeles date +%Y-%m-%d" + OUTPUT_VARIABLE TGZ_DATE + OUTPUT_STRIP_TRAILING_WHITESPACE) + +execute_process( + COMMAND git rev-parse --short=7 HEAD + OUTPUT_VARIABLE TGZ_REV + OUTPUT_STRIP_TRAILING_WHITESPACE) + +message("Generating ${CWD}/marian-${TGZ_DATE}-${TGZ_REV}.tgz") + +# check if pigz is available for faster compression +execute_process( + COMMAND bash -c "which pigz || which gzip" + OUTPUT_VARIABLE COMPRESS + OUTPUT_STRIP_TRAILING_WHITESPACE) + +execute_process( + COMMAND tar -I ${COMPRESS} -cvvf "${CWD}/marian-${TGZ_DATE}-${TGZ_REV}.tgz" -C "${CWD}" + marian + marian-decoder + marian-scorer + marian-vocab + marian-conv + WORKING_DIRECTORY "${CWD}") \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f9d5a5e5b..d1f119335 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -235,33 +235,10 @@ if (NOT COMPILE_LIBRARY_ONLY) set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv) - # marian.zip and marian.tgz - # This combines marian, marian_decoder in a single ZIP or TAR file for - # execution in MSFT internal tools FLO and Philly. - # For Philly submission, we need statically-linked versions to deal with - # library dependencies, so this target is only enabled for static builds. - add_custom_command( - OUTPUT "${CMAKE_BINARY_DIR}/marian.zip" - COMMAND zip -v -0 -j "${CMAKE_BINARY_DIR}/marian.zip" - "${CMAKE_BINARY_DIR}/marian" - "${CMAKE_BINARY_DIR}/marian-decoder" - "${CMAKE_BINARY_DIR}/marian-scorer" - "${CMAKE_BINARY_DIR}/marian-vocab" - "${CMAKE_BINARY_DIR}/marian-conv" + # generate the tgz file via a custom script. This will always re-create the tarball + add_custom_target(marian_tgz + COMMAND ${CMAKE_COMMAND} -DCWD=${CMAKE_BINARY_DIR} -P ${CMAKE_SOURCE_DIR}/cmake/Tarball.cmake DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv) - add_custom_target(marian_zip DEPENDS "${CMAKE_BINARY_DIR}/marian.zip") - - add_custom_command( - OUTPUT "${CMAKE_BINARY_DIR}/marian.tgz" - COMMAND tar -cvvzf "${CMAKE_BINARY_DIR}/marian.tgz" -C "${CMAKE_BINARY_DIR}" - "marian" - "marian-decoder" - "marian-scorer" - "marian-vocab" - "marian-conv" - DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv) - add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz") - add_custom_target(philly DEPENDS marian_tgz marian_zip) if(COMPILE_SERVER) add_executable(marian_server command/marian_server.cpp) From 0df870c12b87b5c43634de1498192261dfb1f6f8 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 29 Jun 2023 23:26:10 +0000 Subject: [PATCH 237/254] Merged PR 28958: LSH for GPU LSH vocab filtering for GPU. Speed is not competitive with non-LSH. Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff eg. decoding $22k sentences, mini-batch 256, maxi-batch 10 using production SSRU model: Without LSH: 53.86 sec. 
With LSH: 108.27 --- CHANGELOG.md | 1 + VERSION | 2 +- src/data/shortlist.cpp | 4 - src/layers/lsh.cpp | 123 ++++++++++------ src/tensors/gpu/tensor_operators.cu | 220 ++++++++++++++++++++++++++++ src/tensors/tensor_operators.h | 5 + 6 files changed, 304 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2a9a9bdd..8778abeed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- LSH vocab filtering for GPU. Speed is not competitive with non-LSH. Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff - Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. - Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian. diff --git a/VERSION b/VERSION index 21decde5d..97cc69d7f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.4 +v1.12.5 diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index da5a6572f..909734ea6 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -95,10 +95,6 @@ Expr LSHShortlist::getIndicesExpr() const { } void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - - ABORT_IF(input->graph()->getDeviceId().type == DeviceType::gpu, - "LSH index (--output-approx-knn) currently not implemented for GPU"); - indicesExpr_ = callback(lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_), [this](Expr node) { node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp index eedf227ee..7dfe83d22 100644 --- a/src/layers/lsh.cpp +++ b/src/layers/lsh.cpp @@ -51,7 +51,14 @@ void fillRandomRotationMatrix(Tensor output, Ptr allocator) { void encode(Tensor output, Tensor input) { int nBits = input->shape()[-1]; // number of bits is equal last dimension of float matrix int nRows = input->shape().elements() / nBits; - faiss::fvecs2bitvecs(input->data(), output->data(), (size_t)nBits, (size_t)nRows); + if (input->getDeviceId().type == DeviceType::cpu) { + faiss::fvecs2bitvecs(input->data(), output->data(), (size_t)nBits, (size_t)nRows); + } + else { +#ifdef CUDA_FOUND + marian::gpu::Float2Bit(output, input); +#endif +} } void encodeWithRotation(Tensor output, Tensor input, Tensor rotation, Ptr allocator) { @@ -123,56 +130,80 @@ Expr searchEncoded(Expr encodedQuery, Expr encodedWeights, int dimK, int firstNR Expr encodedQuery = inputs[0]; Expr encodedWeights = inputs[1]; - int bytesPerVector = encodedWeights->shape()[-1]; - int wRows = encodedWeights->shape().elements() / bytesPerVector; - - // we use this with Factored Segmenter to skip the factor embeddings at the end - if(firstNRows != 0) - wRows = firstNRows; + if (encodedQuery->val()->getDeviceId().type == DeviceType::cpu) { + int bytesPerVector = encodedWeights->shape()[-1]; + int wRows = encodedWeights->shape().elements() / bytesPerVector; + + // we use this with Factored Segmenter to skip the factor embeddings at the end + if(firstNRows != 0) + wRows = firstNRows; - ABORT_IF(dimK > wRows, "k is larger than number of candidate values?"); // @TODO: use min(k, wRows) silently? 
+ ABORT_IF(dimK > wRows, "k is larger than number of candidate values?"); // @TODO: use min(k, wRows) silently? #if _MSC_VER // unfortunately MSVC is horrible at loop unrolling, so we fall back to the old code (hrmph!) @TODO: figure this out one day - int qRows = encodedQuery->shape().elements() / bytesPerVector; - - uint8_t* qCodes = encodedQuery->val()->data(); - uint8_t* wCodes = encodedWeights->val()->data(); - - // use actual faiss code for performing the hamming search. - std::vector distances(qRows * dimK); - std::vector ids(qRows * dimK); - faiss::int_maxheap_array_t res = {(size_t)qRows, (size_t)dimK, ids.data(), distances.data()}; - faiss::hammings_knn_hc(&res, qCodes, wCodes, (size_t)wRows, (size_t)bytesPerVector, 0); - - // Copy int64_t indices to Marian index type and sort by increasing index value per hypothesis. - // The sorting is required as we later do a binary search on those values for reverse look-up. - uint32_t* outData = out->val()->data(); - - int numHypos = out->shape().elements() / dimK; - for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { - size_t startIdx = dimK * hypoIdx; - size_t endIdx = startIdx + dimK; - for(size_t i = startIdx; i < endIdx; ++i) - outData[i] = (uint32_t)ids[i]; - if(!noSort) - std::sort(outData + startIdx, outData + endIdx); - } + int qRows = encodedQuery->shape().elements() / bytesPerVector; + + uint8_t* qCodes = encodedQuery->val()->data(); + uint8_t* wCodes = encodedWeights->val()->data(); + + // use actual faiss code for performing the hamming search. + std::vector distances(qRows * dimK); + std::vector ids(qRows * dimK); + faiss::int_maxheap_array_t res = {(size_t)qRows, (size_t)dimK, ids.data(), distances.data()}; + faiss::hammings_knn_hc(&res, qCodes, wCodes, (size_t)wRows, (size_t)bytesPerVector, 0); + + // Copy int64_t indices to Marian index type and sort by increasing index value per hypothesis. + // The sorting is required as we later do a binary search on those values for reverse look-up. + uint32_t* outData = out->val()->data(); + + int numHypos = out->shape().elements() / dimK; + for (size_t hypoIdx = 0; hypoIdx < numHypos; ++hypoIdx) { + size_t startIdx = dimK * hypoIdx; + size_t endIdx = startIdx + dimK; + for(size_t i = startIdx; i < endIdx; ++i) + outData[i] = (uint32_t)ids[i]; + if(!noSort) + std::sort(outData + startIdx, outData + endIdx); + } #else // this is using the new code for search, other parts of the code, like conversion are fine. 
- IndexType* outData = out->val()->data(); - auto gather = [outData, dimK](IndexType rowId, IndexType k, IndexType kthColId, DistType /*dist*/) { - outData[rowId * dimK + k] = kthColId; - }; - - Parameters params; - params.k = dimK; - params.queryRows = encodedQuery->val()->data(); - params.numQueryRows = encodedQuery->shape().elements() / bytesPerVector; - params.codeRows = encodedWeights->val()->data(); - params.numCodeRows = wRows; - params.bytesPerVector = bytesPerVector; - - hammingTopK(params, gather); + IndexType* outData = out->val()->data(); + auto gather = [outData, dimK](IndexType rowId, IndexType k, IndexType kthColId, DistType /*dist*/) { + outData[rowId * dimK + k] = kthColId; + }; + + Parameters params; + params.k = dimK; + params.queryRows = encodedQuery->val()->data(); + params.numQueryRows = encodedQuery->shape().elements() / bytesPerVector; + params.codeRows = encodedWeights->val()->data(); + params.numCodeRows = wRows; + params.bytesPerVector = bytesPerVector; + + hammingTopK(params, gather); +#endif + } + else { +#ifdef CUDA_FOUND + Ptr backend = out->val()->getBackend(); + + const size_t CHUNK = 128; + const size_t MBYTE = 1024 * 1024; + const size_t GROW = CHUNK * MBYTE; + Ptr alloc = marian::New(backend->getDeviceId(), 0, GROW); + + auto memory = alloc->alloc(requiredBytes(out->shape(), marian::Type::uint32)); + + // not required for calculations. Useful for debugging + Tensor outCounts = nullptr; //marian::TensorBase::New(memory, out->shape(), marian::Type::uint32, backend); + + uint16_t numHash = (uint16_t) encodedWeights->shape()[-1] * 8; + + marian::gpu::HammmingAndSort(out->val(), outCounts, + encodedWeights->val(), encodedQuery->val(), + dimK, 0, numHash, + alloc, backend); #endif + } }; Shape kShape({currBeamSize, batchSize, dimK}); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 51e6f2f2d..508e1e3e7 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1,3 +1,9 @@ +# if defined(_MSC_VER) +#define NPP_MAX_32U ( 4294967295U ) /**< Maximum 32-bit unsigned integer */ +#else +#include +#endif + #include "common/types.h" #include "tensors/tensor_operators.h" @@ -3391,5 +3397,219 @@ void PoolingWithMaskingBackward(Tensor adj, width, lastWidth); } + +////////////////////////////////////////////////////////////////////////////////////////// +// Calc sign(x) for vectors of float. GPU counterpart to Faiss' CPU fvecs2bitvecs() +__global__ void Float2Bit(const float *in, uint32_t *out, int batch, int dim, int outDim) +{ + int batchIdx = blockIdx.x; + const float *inBatchOffset = in + batchIdx * dim; + uint32_t *outBatchOffset = out + batchIdx * outDim; + + int outDimIdx = threadIdx.x; + while (outDimIdx < outDim) { + const float *inDimOffset = inBatchOffset + outDimIdx * 32; + uint32_t &outDimOffset = outBatchOffset[outDimIdx]; + uint32_t outVal = 0; + uint32_t mask = 1; + + for (int bitIdx = 0; bitIdx < 32; ++bitIdx) { + if (inDimOffset[bitIdx] >= 0) + outVal |= mask; + + mask <<= 1; + } + //printf("outVal=%lu \n", outVal); + outDimOffset = outVal; + outDimIdx += blockDim.x; + } +} + +// Calc sign(x) for vectors of float. 
GPU counterpart to Faiss' CPU fvecs2bitvecs() +void Float2Bit(marian::Tensor output, const marian::Tensor input) +{ + int dim = input->shape()[-1]; + assert(dim % 32 == 0); + int batch = input->shape().elements() / input->shape()[-1]; + int outDim = output->shape()[-1] / 4; + + unsigned threads = std::min((unsigned)MAX_THREADS, (unsigned)outDim); + + Float2Bit<<>>(input->data(), output->data(), batch, dim, outDim); + CUDA_CHECK(cudaGetLastError()); +} + +////////////////////////////////////////////////////////////////////////////////////////// +// Calc hamming distance between input and weight hash. Return sorted indices and counts accoding to counting sort algo +// https://www.geeksforgeeks.org/counting-sort/ +__global__ void HammmingAndSort(const uint32_t *weightHash, + const uint32_t *inputHash, + uint16_t *hamming, + uint32_t *outCounts, + uint32_t *outIdx, + uint32_t kBest, uint16_t minVal, uint16_t maxVal, uint16_t range, + int hashDim, int dim, int batch) +{ + extern __shared__ uint32_t sharedCounts[]; + + int batchIdx = blockIdx.x; + + uint32_t *stopVal = sharedCounts + range; + uint16_t *hammingBatchOffset = hamming + ? hamming + batchIdx * dim + : (uint16_t*) (sharedCounts + range); + + uint32_t *outCountsBatchOffset = outCounts ? outCounts + batchIdx * kBest : nullptr; + uint32_t *outIdxBatchOffset = outIdx ? outIdx + batchIdx * kBest : nullptr; + const uint32_t *inputHashOffset = inputHash + batchIdx * hashDim; + + // init count array + int countsIdx = threadIdx.x; + while (countsIdx < range) { + sharedCounts[countsIdx] = 0; + countsIdx += blockDim.x; + } + + __syncthreads(); + int dimIdx = threadIdx.x; + while (dimIdx < dim) { + // Hamming distance between input and hashes + const uint32_t *weightHashOffset = weightHash + dimIdx * hashDim; + + uint16_t dist = 0; + for (int hashDimIdx = 0; hashDimIdx < hashDim; ++hashDimIdx) { + const uint32_t &inputHashes = inputHashOffset[hashDimIdx]; + const uint32_t &weightHashes = weightHashOffset[hashDimIdx]; + uint32_t diff = inputHashes ^ weightHashes; + uint16_t distT = __popc(diff); + dist += distT; + } + + hammingBatchOffset[dimIdx] = dist; + + // counts + uint32_t countIdx = dist - minVal; + assert(countIdx < range); +#if __CUDA_ARCH__ >= 600 + atomicAdd_block(&sharedCounts[countIdx], 1); +#endif + dimIdx += blockDim.x; + } + + // Start counting sort algorithm + __syncthreads(); + // Calc acumulate counts + if (threadIdx.x == 0) { + if (sharedCounts[0] >= kBest) { + (*stopVal) = 0; + } + else { + for (int rangeIdx = 1; rangeIdx < range; ++rangeIdx) { + uint32_t preval = sharedCounts[rangeIdx - 1]; + sharedCounts[rangeIdx] += preval; + if (sharedCounts[rangeIdx] >= kBest) { + (*stopVal) = rangeIdx; + break; + } + } + } + } + + // init output - reuse count array + __syncthreads(); + int rangeIdx = (*stopVal) + threadIdx.x + 1; + while (rangeIdx < range) { + sharedCounts[rangeIdx] = NPP_MAX_32U; + rangeIdx += blockDim.x; + } + + __syncthreads(); + // Reduce + dimIdx = threadIdx.x; + while (dimIdx < dim) { + uint16_t val = hammingBatchOffset[dimIdx]; + assert(val >= minVal); + assert(val <= maxVal); + + uint32_t countIdx = val - minVal; + assert(countIdx < range); + uint32_t &outIdx = sharedCounts[countIdx]; + + if (outIdx != NPP_MAX_32U) { + uint32_t prevOutIdx; +// Not supported in Maxwells or older +// Not supported in Maxwells or older +#if __CUDA_ARCH__ >= 600 + prevOutIdx = atomicAdd_block(&outIdx, (uint32_t) -1); +#else + prevOutIdx = 0; +#endif + assert(prevOutIdx > 0); + assert(prevOutIdx - 1 < dim); + + if (prevOutIdx - 1 < 
kBest) { + if (outCountsBatchOffset) outCountsBatchOffset[prevOutIdx - 1] = val; + if (outIdxBatchOffset) outIdxBatchOffset[prevOutIdx - 1] = dimIdx; + } + } + + dimIdx += blockDim.x; + } +} + +// Calc hamming distance between input and weight hash. Return sorted indices and counts accoding to counting sort algo +// https://www.geeksforgeeks.org/counting-sort/ +void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, + const marian::Tensor weightHash, + const marian::Tensor inputHash, + uint32_t kBest, uint16_t minVal, uint16_t maxVal, + marian::Ptr &alloc, + marian::Ptr &backend) +{ + size_t SHARED_MEM_SIZE = 48000; + + assert(weightHash->shape()[-1] == inputHash->shape()[-1]); + int hashDim = weightHash->shape()[-1] / 4; + + int dim = weightHash->shape().elements() / weightHash->shape()[-1]; + int inputBatch = inputHash->shape().elements() / inputHash->shape()[-1]; + + uint16_t range = maxVal - minVal + 1; + + marian::Shape hammingShape = inputHash->shape(); + hammingShape.set(-1, (int) kBest); + + + size_t mem = range * sizeof(uint32_t) // counts + + sizeof(uint32_t) // stopval + + dim * sizeof(uint16_t); // hamming; + + marian::Tensor hamming; + if (mem > SHARED_MEM_SIZE) { + // shared memory too small. Write haming distance to global mem instead + mem = range *sizeof(uint32_t) + sizeof(uint32_t); + assert(mem <= SHARED_MEM_SIZE); + + hammingShape.set(-1, dim); + auto memory = alloc->alloc(requiredBytes(hammingShape, marian::Type::uint16)); + + hamming = marian::TensorBase::New(memory, hammingShape, marian::Type::uint16, backend); + } + + HammmingAndSort<<>> + (weightHash->data(), + inputHash->data(), + hamming ? hamming->data() : nullptr, + outCounts ? outCounts->data() : nullptr, + outIdx ? outIdx->data() : nullptr, + kBest, minVal, maxVal, range, + hashDim, dim, inputBatch); + CUDA_CHECK(cudaGetLastError()); + + if (hamming) { + alloc->free(hamming->memory()); + } +} + } // namespace gpu } // namespace marian diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 178bb6920..31bd1e14f 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -44,6 +44,11 @@ DISPATCH4(IsNaN, const Tensor, Ptr, bool&, bool&); #ifdef CUDA_FOUND namespace gpu { bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +void Float2Bit(marian::Tensor output, const marian::Tensor input); +void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, + const marian::Tensor weightHash, const marian::Tensor inputHash, + uint32_t kBest, uint16_t minVal, uint16_t maxVal, + marian::Ptr &alloc, marian::Ptr &backend); } #endif From cc66cf617e931a60f6f3a05df4524b5d670266ef Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 29 Jun 2023 23:51:17 +0000 Subject: [PATCH 238/254] Merged PR 29966: More metrics in Marian and MBR scripts This PR adds: * An implementation of BLEURT with conversion script * Some code refactoring for COMET models * A more cleanly separated "evaluate" and "embed" functionality for COMET/COMET-QE/BLEURT * A number of MBR-related scripts. 
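To make the MBR idea behind these scripts concrete, here is a minimal, self-contained sketch of all-vs-all expected-utility selection (illustration only; the `utility` function below is a toy token-overlap stand-in, not one of the metrics shipped here):

```python
# Toy MBR selection: each candidate is scored against all samples acting as
# pseudo-references; the candidate with the highest average utility wins.
# The real scripts plug in sentence-level metrics such as BLEU, ChrF or BLEURT.
from typing import Callable, List

def utility(hyp: str, ref: str) -> float:
    # stand-in sentence-level metric: token-set overlap (Jaccard)
    h, r = set(hyp.split()), set(ref.split())
    return len(h & r) / max(len(h | r), 1)

def mbr_select(samples: List[str],
               score: Callable[[str, str], float] = utility) -> str:
    def expected_utility(cand: str) -> float:
        return sum(score(cand, ref) for ref in samples) / len(samples)
    return max(samples, key=expected_utility)

print(mbr_select(["the cat sat down .", "a cat sat .", "the cat sat ."]))
```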
--- CHANGELOG.md | 7 + VERSION | 2 +- scripts/bleurt/bleurt2marian.py | 223 ++++++++++++++++++ scripts/comet/comet2marian.py | 18 +- scripts/mbr/README.md | 54 +++++ scripts/mbr/comet/comet_mbr.sh | 133 +++++++++++ .../mbr/comet/comet_mbr_with_embeddings.py | 125 ++++++++++ scripts/mbr/generic/explode_collapse.pl | 43 ++++ scripts/mbr/generic/metrics/bleu.sh | 3 + scripts/mbr/generic/metrics/bleurt.sh | 12 + scripts/mbr/generic/metrics/chrf.sh | 3 + scripts/mbr/generic/rescore.pl | 68 ++++++ scripts/mbr/generic/stupid_mbr.sh | 60 +++++ scripts/metrics/.gitignore | 2 + scripts/metrics/Dockerfile | 43 ++++ scripts/metrics/README.md | 36 +++ scripts/metrics/compare.sh | 116 +++++++++ scripts/metrics/docker-run.sh | 20 ++ scripts/metrics/marian-score.sh | 126 ++++++++++ scripts/metrics/setup.sh | 15 ++ src/CMakeLists.txt | 1 + src/command/marian_evaluator.cpp | 15 ++ src/command/marian_main.cpp | 4 + src/common/config.cpp | 51 +--- src/common/config.h | 7 +- src/common/config_parser.cpp | 130 +++++++++- src/common/config_parser.h | 3 +- src/common/config_validator.cpp | 20 +- src/common/config_validator.h | 6 +- src/data/corpus_base.cpp | 21 +- src/data/corpus_base.h | 20 +- src/data/text_input.cpp | 2 +- src/embedder/vector_collector.cpp | 49 +++- src/embedder/vector_collector.h | 37 ++- src/evaluator/evaluator.h | 155 ++++++++++++ src/graph/expression_operators.cpp | 10 +- src/graph/node_operators_binary.h | 2 + src/layers/embedding.cpp | 3 +- src/layers_new/attention.h | 3 +- src/layers_new/neuralnet.h | 12 +- src/layers_new/transformer.h | 5 +- src/models/bleurt.h | 217 +++++++++++++++++ src/models/comet_qe.h | 138 +++++++---- src/models/model_base.h | 12 +- src/models/model_factory.cpp | 134 ++++++----- src/tensors/gpu/gpu_info.cpp | 19 ++ 46 files changed, 1999 insertions(+), 186 deletions(-) create mode 100644 scripts/bleurt/bleurt2marian.py create mode 100644 scripts/mbr/README.md create mode 100755 scripts/mbr/comet/comet_mbr.sh create mode 100644 scripts/mbr/comet/comet_mbr_with_embeddings.py create mode 100755 scripts/mbr/generic/explode_collapse.pl create mode 100755 scripts/mbr/generic/metrics/bleu.sh create mode 100755 scripts/mbr/generic/metrics/bleurt.sh create mode 100755 scripts/mbr/generic/metrics/chrf.sh create mode 100755 scripts/mbr/generic/rescore.pl create mode 100755 scripts/mbr/generic/stupid_mbr.sh create mode 100644 scripts/metrics/.gitignore create mode 100644 scripts/metrics/Dockerfile create mode 100644 scripts/metrics/README.md create mode 100755 scripts/metrics/compare.sh create mode 100755 scripts/metrics/docker-run.sh create mode 100755 scripts/metrics/marian-score.sh create mode 100755 scripts/metrics/setup.sh create mode 100644 src/command/marian_evaluator.cpp create mode 100644 src/evaluator/evaluator.h create mode 100644 src/models/bleurt.h create mode 100644 src/tensors/gpu/gpu_info.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 8778abeed..a436308c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts. +- `./marian evaluate` sub command for evaluation with COMET-QE-20, COMET-20 and BLEURT-20 +- A bunch of scripts for metrics use and early MBR experiments - LSH vocab filtering for GPU. Speed is not competitive with non-LSH. 
Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff - Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. @@ -21,6 +24,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Handle copying from fp32 or fp16 embeddings in embedder mode correctly. - Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp). +### Changed +- Removed --num-devices N option that wasn't really used by anyone (I assume). + + ## [1.12.0] - 2023-02-20 ### Added diff --git a/VERSION b/VERSION index 97cc69d7f..f15731572 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.5 +v1.12.6 diff --git a/scripts/bleurt/bleurt2marian.py b/scripts/bleurt/bleurt2marian.py new file mode 100644 index 000000000..25aa8206f --- /dev/null +++ b/scripts/bleurt/bleurt2marian.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 +""" +This script converts Google BLEURT models to Marian weight file. +""" + +import argparse +import logging as log +import numpy as np +import yaml +from pathlib import Path + +BLEURT_LOCATION = 'lucadiliello/BLEURT-20' + +log.basicConfig(level=log.INFO) + +parser = argparse.ArgumentParser(description='Convert Google BLEURT models to Marian weight file.') +parser.add_argument('--marian', '-m', help='Output path for Marian weight file', required=True) +parser.add_argument('--spm', '-spm', type=Path, help='Save tokenizer SPM file here', required=False) +args = parser.parse_args() + +def load_bleurt_model(): + from bleurt_pytorch import BleurtForSequenceClassification, BleurtTokenizer + + bleurt_model = BleurtForSequenceClassification.from_pretrained(BLEURT_LOCATION) + bleurt_model.eval() + tokenizer = BleurtTokenizer.from_pretrained(BLEURT_LOCATION) + vocab_file = None + if tokenizer.vocab_file and Path(tokenizer.vocab_file).exists(): + vocab_file = tokenizer.vocab_file + return bleurt_model, vocab_file + +bleurt_model, vocab_file = load_bleurt_model() + +if args.spm: + vocab_file = vocab_file and Path(vocab_file) + if vocab_file and vocab_file.exists(): + if not args.spm.parent.exists(): + raise Exception(f"Directory {args.spm.parent} does not exist") + log.info(f"Copying {vocab_file} to {args.spm}") + args.spm.write_bytes(vocab_file.read_bytes()) + else: + raise Exception(f"Could not locate or save the vocab file: {vocab_file}; please remove --spm argument and try downloading the file manually") + +marianModel = dict() +config = dict() + +config["type"] = "bleurt" +config["tied-embeddings-all"] = True +config["tied-embeddings-src"] = False +config["transformer-ffn-depth"] = 2 +config["transformer-ffn-activation"] = "gelu" # figure this out dynamically +config["transformer-train-position-embeddings"] = True +config["transformer-preprocess"] = "" +config["transformer-postprocess"] = "dan" +config["transformer-postprocess-emb"] = "nd" +config["bert-train-type-embeddings"] = True +config["bert-type-vocab-size"] = 2 +config["comet-prepend-zero"] = True +config["input-join-fields"] = True +config["version"] = "bleurt2marian.py conversion" +config["enc-depth"] = 0 + +def yaml2np(config): + configYamlStr = yaml.dump(config, default_flow_style=False) + print("\nMarian config:") + print(configYamlStr) + + desc = bytes(configYamlStr, 'ascii') + b'\x00' + npDesc = np.chararray((len(desc),)) + npDesc.dtype = np.int8 + for i, b in enumerate(desc): 
+ npDesc[i] = b + return npDesc + +def convert(pd, srcs, trg, transpose=True, bias=False): + if len(srcs) == 1: + for src in srcs: + num = pd[src].detach().numpy() + if bias: + marianModel[trg] = num.copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + else: # path that joins matrices together for fused self-attention + nums = [pd[src].detach().numpy() for src in srcs] + if bias: + nums = [np.transpose(num) for num in nums] + marianModel[trg] = np.stack(nums, axis=0).copy() + +def extract(layer, nth, level): + name = type(layer).__name__ + print(" " * level, nth, name) + + if "BleurtEncoder" in name: + # embedding projection + prefix = "BleurtEncoder" + + pd = dict(layer.named_parameters()) + for n in pd: + if "embedding_projection" in n: + print(" " * (level + 1), n, pd[n].shape) + + convert(pd, ["embedding_projection.weight"], f"{prefix}->encoder->eProj->weight") + convert(pd, ["embedding_projection.bias"], f"{prefix}->encoder->eProj->bias", bias=True) + + # continue recursing down the model structure + recurse(layer, level + 1) + + elif "BleurtLayer" in name: + pd = dict(layer.named_parameters()) + for n in pd: + print(" " * (level + 1), n, pd[n].shape) + + prefix = "BleurtEncoder" + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->selfAttentionBlock" + + if not "transformer-dim-model" in config: + query = pd["attention.self.query.weight"].detach().numpy() + config["transformer-dim-model"] = query.shape[1] + + # self-attention + # query transformation + convert(pd, ["attention.self.query.weight"], f"{blockPrefix}->selfAttention->qProj->weight") + convert(pd, ["attention.self.query.bias"], f"{blockPrefix}->selfAttention->qProj->bias", bias=True) + + # key transformation + convert(pd, ["attention.self.key.weight"], f"{blockPrefix}->selfAttention->kProj->weight") + convert(pd, ["attention.self.key.bias"], f"{blockPrefix}->selfAttention->kProj->bias", bias=True) + + # values transformation + convert(pd, ["attention.self.value.weight"], f"{blockPrefix}->selfAttention->vProj->weight") + convert(pd, ["attention.self.value.bias"], f"{blockPrefix}->selfAttention->vProj->bias", bias=True) + + # output transformation + convert(pd, ["attention.output.dense.weight"], f"{blockPrefix}->selfAttention->oProj->weight") + convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) + + # self-attention layer-norm + convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + # ffn + # first ffn layer + blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->filterBlock" + + convert(pd, ["intermediate.dense.weight"], f"{blockPrefix}->layers->at(0)->as()->weight") + convert(pd, ["intermediate.dense.bias"], f"{blockPrefix}->layers->at(0)->as()->bias", bias=True) + # second ffn layer + convert(pd, ["output.dense.weight"], f"{blockPrefix}->layers->at(3)->as()->weight") + convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) + # ffn layer-norm + convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + + config["transformer-dim-ffn"] = pd["intermediate.dense.bias"].shape[-1] + config["transformer-heads"] = layer.attention.self.num_attention_heads + config["enc-depth"] 
+= 1 + + elif "BleurtEmbeddings" in name: + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + # @TODO: this is a dirty trickery and should be solved differently in the future + npWemb = pd["word_embeddings.weight"].detach().numpy() + # put embedding of [CLS] in place of [PAD] (0) + npWemb[0, :] = npWemb[312, :] + # put embedding of [SEP] in place of + npWemb[1, :] = npWemb[313, :] + marianModel["Wemb"] = npWemb + + prefix = "BleurtEncoder" + + npPos = pd["position_embeddings.weight"].detach().numpy() + # this should be moved out of the encoder into a special embedding layer + marianModel[f"{prefix}->encoder->positionEmbedding->embeddings"] = npPos + + npType = pd["token_type_embeddings.weight"].detach().numpy() + marianModel[f"{prefix}->typeEmbedding->embeddings"] = npType + + # post-embedding layer normalization + convert(pd, ["LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) + convert(pd, ["LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) + + config["dim-emb"] = npWemb.shape[1] + config["dim-vocabs"] = [ npWemb.shape[0] ] + config["max-length"] = npPos.shape[0] + + # this will be the bleurt pooler right here: + elif name == "BleurtPooler": + for n, p in layer.named_parameters(): + print(" " * (level + 1), n, p.shape) + pd = dict(layer.named_parameters()) + + + prefix = "BleurtPooler" + convert(pd, ["dense.weight"], f"{prefix}->layers->at(0)->as()->weight") + convert(pd, ["dense.bias"], f"{prefix}->layers->at(0)->as()->bias", bias=True) + + else: + recurse(layer, level + 1) + +def recurse(parent, level=0): + for i, child in enumerate(parent.children()): + extract(child, i, level) + +recurse(bleurt_model) + +# last layer +prefix = "BleurtPooler" +pd = dict(bleurt_model.named_parameters()) +convert(pd, ["classifier.weight"], f"{prefix}->layers->at(3)->as()->weight") +convert(pd, ["classifier.bias"], f"{prefix}->layers->at(3)->as()->bias", bias=True) + +marianModel["special:model.yml"] = yaml2np(config) + +for m in marianModel: + print(m, marianModel[m].shape) + +print("Saving Marian model to %s" % (args.marian,)) +np.savez(args.marian, **marianModel) \ No newline at end of file diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 8ef4d29fc..69c8abf59 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -7,13 +7,15 @@ import logging as log import numpy as np import yaml - from pathlib import Path ## Uncomment to see model names supported by your installed version of unbabel-comet # from comet.models import available_metrics # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] -supported_comets = ['wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da'] +supported_comets = [ + 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', + 'wmt20-comet-da', 'wmt21-comet-da' +] log.basicConfig(level=log.INFO) parser = argparse.ArgumentParser(description='Convert Unbabel COMET-QE models to Marian weight file.') @@ -80,11 +82,17 @@ def load_comet_model(model_path): else: raise Exception(f"Could not locate or save the vocab file: {vocab_file}; please remove --spm argument and try downloading the file manually") - marianModel = dict() - config = dict() -config["type"] = "comet-qe" + +model_type = type(cometModel).__name__ +if model_type == "RegressionMetric": + config["type"] = "comet" +elif model_type == "ReferencelessRegression": + 
config["type"] = "comet-qe"
+else:
+    raise Exception(f'Unknown type of model {model_type}')
+
 config["tied-embeddings-all"] = True
 config["tied-embeddings-src"] = False
 config["transformer-ffn-depth"] = 2
diff --git a/scripts/mbr/README.md b/scripts/mbr/README.md
new file mode 100644
index 000000000..1ccdb370b
--- /dev/null
+++ b/scripts/mbr/README.md
@@ -0,0 +1,54 @@
+# Some notes on MBR
+
+All of this is experimental, use at your own risk.
+
+## MBR with COMET
+
+This concerns the scripts in the `comet` folder:
+
+This script is for efficient MBR with COMET. COMET allows embedding source and hypotheses separately, which makes it very easy to optimize.
+Only the final embeddings are used to create the NxN scores.
+
+Example usage:
+
+### prepare the source and samples
+sacrebleu -t wmt21 -l en-de --echo src > wmt21.src
+cat wmt21.src | perl -pe '\$_ = \$_ x 128' > wmt21.128.src
+cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \
+   -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \
+   --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out
+
+### run MBR with COMET
+cat wmt21.128.out | ~/marian-dev/scripts/mbr/comet/comet_mbr.sh -m wmt20-comet-da.npz -n 128 -s wmt21.src -g 8 > wmt21.128.mbr.out
+cat wmt21.128.mbr.out | cut -f 4 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text
+
+
+## "Stupid" MBR (generic)
+
+This concerns the scripts in the `generic` folder
+
+This script can be used to do "stupid" MBR (i.e. all-vs-all MBR with any reference-based metric specified in the metrics folder).
+The subscripts in the metrics folder need to be able to calculate sentence-level results. This should be done as efficiently as possible
+in order to score all NxN variants (where N is sample size). The explode_collapse.pl script below does some smart deduping as far as
+possible, but the complexity will still be close to NxN.
+ +Example usage: + +### prepare the sample +``` +sacrebleu -t wmt21 -l en-de --echo src | perl -pe '\$_ = \$_ x 128' > wmt21.128.src +cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \ + -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \ + --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out +``` + +### run MBR, here with ChrF +``` +cat wmt21.128.out | ~/marian-dev/scripts/mbr/generic/stupid_mbr.sh 128 128 chrf > wmt21.128.sorted.out +``` + +### select the top translation according to ChrF MBR and evaluate result + +``` +cat wmt21.128.sorted.out | grep ^BEST | cut -f 3 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text +``` \ No newline at end of file diff --git a/scripts/mbr/comet/comet_mbr.sh b/scripts/mbr/comet/comet_mbr.sh new file mode 100755 index 000000000..9ba97b4a7 --- /dev/null +++ b/scripts/mbr/comet/comet_mbr.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +OPTIONS=$(getopt -o t:s:o:n:m:g:h --long hyps:,source:,output:,num_hyps:,model:,num_gpus:,help -- "$@") +eval set -- "$OPTIONS" + +while true; do + case "$1" in + -t|--hyps) + >&2 echo "Option hyps=$2" + hyps_file=$2 + shift 2;; + -s|--source) + >&2 echo "Option source=$2" + source_file=$2 + shift 2;; + -o|--output) + >&2 echo "Option output=$2" + out_file=$2 + shift 2;; + -n|--num_hyps) + >&2 echo "Option num_hyps=$2" + num_hyps=$2 + shift 2;; + -m|--model) + >&2 echo "Option model=$2" + comet_model=$2 + shift 2;; + -g|--num_gpus) + >&2 echo "Option num_gpus=$2" + num_gpus=$2 + shift 2;; + -h|--help) + help=1 + shift;; + --) + shift; break;; + *) + >&2 echo "Internal error!" ; exit 1 ;; + esac +done + +if [[ "$help" = "1" ]] +then +cat >&2 < wmt21.src +cat wmt21.src | perl -pe '\$_ = \$_ x 128' > wmt21.128.src +cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \ + -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \ + --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out + +# run MBR with COMET +cat wmt21.128.out | ~/marian-dev/scripts/mbr/comet/comet_mbr.sh -m wmt20-comet-da.npz -n 128 -s wmt21.src -g 8 > wmt21.128.mbr.out +cat wmt21.128.mbr.out | cut -f 4 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text + +END +exit +fi + + +hyps_file=${hyps_file:-/dev/stdin} +out_file=${out_file:-/dev/stdout} +num_hyps=${num_hyps:-128} +comet_model=${comet_model:-wmt20-comet-da.npz} +num_gpus=${num_gpus:-8} + +script_path=$(dirname $0) +marian=$script_path/../../../build/marian + +comet_path=$(dirname $comet_model) +devices=$(seq 0 $(($num_gpus-1))) + +tmp=/tmp + +# create temporary files and delete them right after, use file descriptor instead +# (will free disk space after script ends, even when interrupted) +samples=$(mktemp $tmp/samples.XXXXXX) +exec 3>"$samples" +rm "$samples" +samples=/dev/fd/3 + +source=$(mktemp $tmp/source.XXXXXX) +exec 4>"$source" +rm "$source" +source=/dev/fd/4 + +source_embeddings=$(mktemp $tmp/source.embeddings.bin.XXXXXX) +exec 5>"$source_embeddings" +rm "$source_embeddings" +source_embeddings=/dev/fd/5 + +hyps_embeddings=$(mktemp $tmp/sample.embeddings.bin.XXXXXX) +exec 6>"$hyps_embeddings" +rm "$hyps_embeddings" +hyps_embeddings=/dev/fd/6 + +# done with creating temporary files + +lines_hyps=$(cat $hyps_file | tee $samples | wc -l) +lines_source=$(cat $source_file | tee $source | wc -l) + +>&2 echo "Computing source embeddings ($lines_source lines) with $comet_model" + +cat $source \ +| 
pv -ptel -s $lines_source \ +| $marian embed -m $comet_model -v $comet_path/roberta-vocab.spm \ + --like roberta -d $devices --fp16 --binary --quiet \ +> $source_embeddings + +>&2 echo "Computing sample embeddings ($lines_hyps lines, $num_hyps per sentence) with $comet_model" + +cat $samples \ +| pv -ptel -s $lines_hyps \ +| $marian embed -m $comet_model -v $comet_path/roberta-vocab.spm \ + --like roberta -d $devices --fp16 --binary --quiet \ +> $hyps_embeddings + +>&2 echo "Computing MBR scores" + +cat $samples \ +| pv -ptel -s $lines_hyps \ +| python $script_path/comet_mbr_with_embeddings.py \ + -m $comet_model -s $source_embeddings -t $hyps_embeddings \ + --num_source $lines_source --num_hyps $num_hyps \ + -d $devices --batch_size 128 --fp16 \ +> $out_file + +>&2 echo "Done" diff --git a/scripts/mbr/comet/comet_mbr_with_embeddings.py b/scripts/mbr/comet/comet_mbr_with_embeddings.py new file mode 100644 index 000000000..f14207af8 --- /dev/null +++ b/scripts/mbr/comet/comet_mbr_with_embeddings.py @@ -0,0 +1,125 @@ +import numpy as np +import cupy as cp +import sys +import argparse +from pathlib import Path + +parser = argparse.ArgumentParser(description='Apply MBR with COMET top layers') +parser.add_argument('-m', '--model', type=Path, help='COMET model path', required=True) +parser.add_argument('-s', '--source', type=Path, help='Source file embeddings', required=True) +parser.add_argument('-t', '--hyps', type=Path, help='Sample file embeddings', required=True) +parser.add_argument('--num_source', type=int, help='Number of sentence', required=True) +parser.add_argument('--num_hyps', type=int, help='Number of samples per sentence', required=True) +parser.add_argument('--fp16', help='Use fp16 for computation', action='store_true') +parser.add_argument('--batch_size', type=int, help='Batch size during MBR', default=32) +parser.add_argument('-d', '--devices', nargs='+', type=int, help="GPU device id to use", default=[0, 1, 2, 3, 4, 5, 6, 7]) +args = parser.parse_args() + + +model_path = args.model +src_emb_path = args.source +smp_emb_path = args.hyps + +num_sents = args.num_source +num_samps = args.num_hyps + +emb_size = 1024 + +compute_type=cp.float32 +if args.fp16: + compute_type=cp.float16 + +batch_size = args.batch_size +devices = args.devices + +sources = np.memmap(src_emb_path, mode='r', dtype=np.float32, shape=(num_sents, emb_size)) +samples = np.memmap(smp_emb_path, mode='r', dtype=np.float32, shape=(num_sents, num_samps, emb_size)) + +def mbr_decode_batch(pooler, mt, src, ref): + batch_size = mt.shape[0] + + diffRef = abs(mt - ref) + prodRef = mt * ref + + diffSrc = cp.repeat(abs(mt - src), repeats=num_samps, axis=-2); + prodSrc = cp.repeat(mt * src, repeats=num_samps, axis=-2); + + mt = cp.repeat(mt, repeats=num_samps, axis=-2) + ref = cp.repeat(ref, repeats=batch_size, axis=-3) + + emb = cp.concatenate([mt, ref, prodRef, diffRef, prodSrc, diffSrc], axis=-1) + + layer1 = cp.tanh(cp.dot(emb, pooler[0]["weight"]) + pooler[0]["bias"]) + layer2 = cp.tanh(cp.dot(layer1, pooler[1]["weight"]) + pooler[1]["bias"]) + comet = cp.dot(layer2, pooler[2]["weight"]) + pooler[2]["bias"] + + mbr_score = cp.reshape(cp.mean(comet, axis=-2), (batch_size,)) + + return mbr_score + + +def mbr_decode(pooler, i, batch_size=50): + sources_gpu = cp.asarray(sources[i, :], compute_type) + samples_gpu = cp.asarray(samples[i, :, :], compute_type) + + src = cp.reshape(sources_gpu, (1, 1, emb_size)) + mt = cp.reshape(samples_gpu, (num_samps, 1, emb_size)) + ref = cp.reshape(mt, (1, num_samps, emb_size)) + + batches 
= cp.array_split(mt, int(num_samps / batch_size)) + scores = [] + for batch in batches: + mbr_scores_batch = mbr_decode_batch(pooler, batch, src, ref) + scores.append(mbr_scores_batch) + + mbr_scores = cp.concatenate(scores, axis=-1) + best_index = cp.argmax(mbr_scores, axis=-1) + best_score = cp.max(mbr_scores, axis=-1) + + return best_index, best_score + +def consume(k): + j = 0 + candidates = [] + for line in sys.stdin: + line = line.rstrip() + candidates.append(line) + + if len(candidates) == num_samps: + best = best_gpu[k + j] + best_index = cp.asnumpy(best[0]) + best_score = cp.asnumpy(best[1]) + print(f"{k + j}\t{best_index}\t{best_score:.4f}\t{candidates[best_index]}") + candidates = [] + j += 1 + if j == step: + k += step + break + return k + +##################################################### + +model = np.load(model_path) + +poolers = [] +for id in devices: + with cp.cuda.Device(id): + pooler = [] + for i, layerNo in enumerate([0, 3, 6]): + w = cp.asarray(model[f"CometQEPooler->layers->at({layerNo})->as()->weight"], compute_type) + b = cp.asarray(model[f"CometQEPooler->layers->at({layerNo})->as()->bias"], compute_type) + pooler.append({"weight": w, "bias": b}) + poolers.append(pooler) + +step = batch_size +best_gpu = [] +k = 0 +for i in range(num_sents): + gpu_id = i % len(devices) + with cp.cuda.Device(devices[gpu_id]): + best_gpu.append(mbr_decode(poolers[gpu_id], i, batch_size=batch_size)) + if len(best_gpu) % step == 0: + k = consume(k) + +# get rest +k = consume(k) diff --git a/scripts/mbr/generic/explode_collapse.pl b/scripts/mbr/generic/explode_collapse.pl new file mode 100755 index 000000000..df1dbb085 --- /dev/null +++ b/scripts/mbr/generic/explode_collapse.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +# Helper script that takes the sample file with N samples and M references (first M among N samples) +# and creates deduped(!) N' x M' pairs (N' is N after deduplication, same for M') for scoring. +# Creating the pairs is "exploding", deduping is "collapsing", hence the name. +# Includes ids so that the original order from before deduplication can be restored. + +my $N = $ARGV[0]; +my $R = $ARGV[1]; +$R = $N if not defined($R); + +sub explodeCollapse { + my $id = shift; + my @samples = @_; + + my %cnd; + foreach(@samples) { + $cnd{$_} = scalar keys %cnd if not exists($cnd{$_}); + } + + my @uniq = sort { $cnd{$a} <=> $cnd{$b} } keys %cnd; + foreach my $t (@uniq) { + my $c = 0; + foreach my $r (@uniq) { + last if($c >= $R); + # this outputs the pseudo-reference first! + printf("%d\t%d\t%d\t%s\t%s\n", $id, $cnd{$r}, $cnd{$t}, $r, $t); + $c++; + } + } +} + +my @samples; +my $id = 0; +while() { + chomp; + push(@samples, $_); + if(@samples == $N) { + explodeCollapse($id, @samples); + @samples = (); + $id++; + } +} diff --git a/scripts/mbr/generic/metrics/bleu.sh b/scripts/mbr/generic/metrics/bleu.sh new file mode 100755 index 000000000..e94d74d77 --- /dev/null +++ b/scripts/mbr/generic/metrics/bleu.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +parallel --cat -k -j32 --block 10M "sacrebleu <(cut -f 1 {}) < <(cut -f 2 {}) -b -w 4 -sl --format text --metrics bleu" diff --git a/scripts/mbr/generic/metrics/bleurt.sh b/scripts/mbr/generic/metrics/bleurt.sh new file mode 100755 index 000000000..a7095825d --- /dev/null +++ b/scripts/mbr/generic/metrics/bleurt.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +gpus=${1:-8} +scriptPath=$(dirname $0) +root=$scriptPath/../../../../. 
+marian=$root/build/marian +bleurt=$root/scripts/bleurt +devices=$(seq 0 $(($gpus-1))) + +# we reverse the input here since the scorer expects "hypref" but we output pseudo-references first +perl -F'\t' -ane 'chomp(@F); print "$F[1]\t$F[0]\n"' \ +| $marian evaluate -m $bleurt/bleurt-20.npz -v $bleurt/bleurt-vocab.{spm,spm} --like bleurt -d $devices --fp16 --quiet diff --git a/scripts/mbr/generic/metrics/chrf.sh b/scripts/mbr/generic/metrics/chrf.sh new file mode 100755 index 000000000..05a51de10 --- /dev/null +++ b/scripts/mbr/generic/metrics/chrf.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +parallel --cat -k -j32 --block 10M "sacrebleu <(cut -f 1 {}) < <(cut -f 2 {}) -b -w 4 -sl --format text --metrics chrf" diff --git a/scripts/mbr/generic/rescore.pl b/scripts/mbr/generic/rescore.pl new file mode 100755 index 000000000..7374056ad --- /dev/null +++ b/scripts/mbr/generic/rescore.pl @@ -0,0 +1,68 @@ +#!/usr/bin/perl + +# Helper scripts that takes the pairs created with explode_collapse.pl and the metric scores +# for each pair, computes MBR and by highest score. Restores original sample number (not order, due to sorting). +# Grepping for "^BEST:" will result in a file with as many hypotheses as orignal input sentences in the right order. + +my $N = $ARGV[0]; +my $R = $ARGV[1]; +open(IDS, "cat < $ARGV[2] |"); +open(SCORES, "cat < $ARGV[3] |"); + +$| = 1; + +sub score { + my $samples = shift; + my $scores = shift; + + my %cnd; + foreach(@$samples) { + $cnd{$_} = scalar keys %cnd if not exists($cnd{$_}); + } + + my @scored; + foreach my $t (@$samples) { + my $sum = 0; + my $tid = $cnd{$t}; + my $c = 0; + foreach my $r (@$samples) { + my $rid = $cnd{$r}; + if(exists($scores->{$tid}->{$rid}) and $c < $R) { + $sum += $scores->{$tid}->{$rid}; + $c++; + } + } + push(@scored, [$sum / $c, $t]); + } + my ($best, @rest) = sort { $b->[0] <=> $a->[0] } @scored; + printf("BEST\t%.4f\t%s\n", @$best); + printf("REST\t%.4f\t%s\n", @$_) foreach(@rest); +} + +my $samples = []; +my $scores = {}; +my $id1 = 0; +while() { + chomp; + push(@$samples, $_); + if(@$samples == $N) { + my ($ids, $score); + while(($ids = ) and ($score = )) { + chomp($ids, $score); + my($id2, $r, $t) = split(/\t/, $ids); + if($id1 == $id2) { + $scores->{$t}->{$r} = $score; + } else { + score($samples, $scores); + $samples = []; + $scores = {}; + $scores->{$t}->{$r} = $score; + last; + } + } + $id1++; + } +} +score($samples, $scores); + +close(SCORES) diff --git a/scripts/mbr/generic/stupid_mbr.sh b/scripts/mbr/generic/stupid_mbr.sh new file mode 100755 index 000000000..b19c0d0ae --- /dev/null +++ b/scripts/mbr/generic/stupid_mbr.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +if [[ "$1" = "--help" ]] +then +cat >&2 < wmt21.128.src +cat wmt21.128.src | ~/marian-dev/build/marian-decoder -m translation-model.npz \ + -v translation-model-vocab.{spm,spm} -b1 --mini-batch 32 --maxi-batch 100 --maxi-batch-sort src \ + --max-length 256 --max-length-crop -d all --output-sampling > wmt21.128.out + +# run MBR, here with ChrF +cat wmt21.128.out | ~/marian-dev/scripts/mbr/generic/stupid_mbr.sh 128 128 chrf > wmt21.128.sorted.out + +# select the top translation according to ChrF MBR and evaluate result +cat wmt21.128.sorted.out | grep ^BEST | cut -f 3 | sacrebleu -t wmt21 -l en-de --metrics bleu chrf -w 2 --format text + +END +exit +fi + +num_samples=${1:-128} +num_references=${2:-$num_samples} +metric=${3:-bleu} +gpus=${4:-8} + +scriptPath=$(dirname $0) +tmp=$(mktemp -d) + +cat \ +| tee >(wc -l > $tmp/lines_input) \ +| pigz > $tmp/input.txt.gz + +lines_input=$(cat 
$tmp/lines_input) + +>&2 echo "Computing $metric scores" + +pigz -dc $tmp/input.txt.gz \ +| pv -ptel -s $lines_input \ +| perl $scriptPath/explode_collapse.pl $num_samples $num_references 2>/dev/null \ +| tee >(cut -f 1,2,3 > $tmp/ids) \ +| cut -f 4,5 \ +| $scriptPath/metrics/$metric.sh $gpus \ +> $tmp/scores + +>&2 echo "Computing MBR scores" + +pigz -dc $tmp/input.txt.gz \ +| pv -ptel -s $lines_input \ +| perl $scriptPath/rescore.pl $num_samples $num_references $tmp/ids $tmp/scores + +rm -rf $tmp +>&2 echo "Done" diff --git a/scripts/metrics/.gitignore b/scripts/metrics/.gitignore new file mode 100644 index 000000000..5d66dfcd9 --- /dev/null +++ b/scripts/metrics/.gitignore @@ -0,0 +1,2 @@ +bins/ +tmp.* \ No newline at end of file diff --git a/scripts/metrics/Dockerfile b/scripts/metrics/Dockerfile new file mode 100644 index 000000000..4641e6571 --- /dev/null +++ b/scripts/metrics/Dockerfile @@ -0,0 +1,43 @@ +FROM nvidia/cuda:11.1.1-devel-ubuntu20.04 + +LABEL description="Marian image - Ubuntu 20.04" + +ARG DEBIAN_FRONTEND=noninteractive +ARG NCPU=24 +ARG MARIAN_REPO="https://github.com/marian-nmt/marian-dev" +ARG MARIAN_BRANCH=master + +RUN apt-get update \ + && apt-get install -y wget apt-utils python3-pip git cmake build-essential \ + intel-mkl openmpi-common openmpi-bin libopenmpi-dev pkg-config \ + && apt-get clean + +RUN ln -sf /usr/bin/python3 /usr/bin/python && \ + ln -sf /usr/bin/pip3 /usr/bin/pip + +# install unbabel-comet (requires pytorch) and bleurt (requires tensorflow and cudnn) +# note: unabel-comet 2.x is broken use 1.x. requires numpy < 1.24 +RUN pip install --upgrade pip \ + && pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \ + && pip install sacrebleu unbabel-comet==1.1.3 numpy==1.23.5 nvidia-cudnn-cu11==8.6.0.163 git+https://github.com/google-research/bleurt.git \ + && rm -rf ~/.cache/pip/ + +# Install sentencepiece +RUN pip3 uninstall -y sentencepiece && \ + mkdir -p src && \ + cd src && \ + git clone https://github.com/marian-nmt/sentencepiece && \ + cd sentencepiece && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release .. && \ + make -j install && \ + cd ../python && \ + python3 setup.py install && \ + cd ../../.. && \ + rm -rf src + +RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \ + && mkdir /marian/build && cd /marian/build \ + && cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=off -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off \ + && make -j $NCPU && cp -v marian spm_encode spm_decode /usr/bin/ \ diff --git a/scripts/metrics/README.md b/scripts/metrics/README.md new file mode 100644 index 000000000..4d04c20b7 --- /dev/null +++ b/scripts/metrics/README.md @@ -0,0 +1,36 @@ +# Marian Evaluate +The main script is `compare.sh`, however it needs to be run in an environment where all three -- marian, unbabel-comet(pytorch), and bleurt(tensorflow) are available. +Hence, 1) we create a docker container with all the necessary libs. + and 2) run compare.sh inside the docker environment + +## Setup: build docker image + +```bash +./setup.sh +``` + +## Run compare.sh in docker container + +```bash +./docker-run.sh +``` +The `docker-run.sh` script mounts cache directory from the host to container. +The necessary files (weights and vocabularies) will be automatically downloaded and cached for unbabel-comet and Bleurt metrics. +However, for `marian-score.sh` expects the cache to be prepared under `$HOME/.cache/marian/metrics`. 
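+
+For example, the cache can be populated manually (a sketch only; the checkpoints are assumed to be Marian-converted model files you already have locally, and the `/path/to/...` paths are placeholders):
+```bash
+mkdir -p $HOME/.cache/marian/metrics/comet20-da-src
+cp /path/to/comet20-qe-da.model.npz /path/to/roberta.vocab.spm \
+  $HOME/.cache/marian/metrics/comet20-da-src/
+```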
+The structure/format of the cache directory for marian-score.sh looks as follows: +```bash +/home/$USER/.cache/marian/metrics/ +├── bleurt20-ref +│ ├── bleurt-20.model.npz +│ ├── bleurt.vocab.spm +├── comet20-da-src +│ ├── comet20-qe-da.model.npz +│ └── roberta.vocab.spm +└── comet20-da-src+ref + ├── comet20-da.model.npz + └── roberta.vocab.spm +``` +Each metric subdir should have a `*model.npz` and a `*vocab.spm` files, and the name of metric directory should end with `-src|-qe|-ref|-src+ref` suffix to indicate the category of metric. + +> TODO: Upload Marian compatible comet and bleurt models to public blob storage and modify script to automatically download + diff --git a/scripts/metrics/compare.sh b/scripts/metrics/compare.sh new file mode 100755 index 000000000..902258863 --- /dev/null +++ b/scripts/metrics/compare.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +export PATH=$MYDIR:$PATH + +log() { + echo -e "\e[1;32m[$(date '+%Y-%m-%d %H:%M:%S')]\e[0m $@" >&2 +} + +get_sacrebleu_names(){ + # using sacrebleu to get the list of systems + testset=wmt21/systems + while read line; do + pair=$(cut -f1 -d':' <<< $line) + refs=() + mts=() + while read name; do + # skip if name starts with $pair or src or docid + if [[ $name == $pair* || $name == src || $name == docid || $name == origlang ]]; then + continue + fi + if [[ $name == ref* ]]; then + refs+=($name) + else + mts+=($name) + fi + done < <(sed 's/,//g;s/ /\n/g' <<< $line) + + # flatten: ref x mt + for ref in ${refs[@]}; do + for mt in ${mts[@]}; do + echo -e "$testset\t$pair\t$ref\t$mt" + done + done + done < <(sacrebleu -t $testset --list) +} + +unbabel_score(){ + local metric=$1 + local prefix=$2 + log "Running $metric" + local batch_size=64 + comet-score --batch_size $batch_size --model $metric -s $prefix.src -r $prefix.ref -t $prefix.mt \ + | awk -F '[:\t]' 'NF==4{print $NF}' +} + + +bleurt_score() { + local metric_name=$1 + local prefix=$2 + [[ $metric_name == "BLEURT-20" ]] || { + log "ERROR: BLEURT-20 is the only supported metric; given: $metric_name" + exit 1 + } + local cache_dir=$HOME/.cache/bleurt + local metric_path=$cache_dir/$metric_name + [[ -f $metric_path/._OK ]] || { + log "BLEURT model not found in $HOME/.cache/bleurt .. 
Downloading" + mkdir -p $cache_dir + rm -rf $metric_path.zip # remove incomplete file + wget https://storage.googleapis.com/bleurt-oss-21/$metric_name.zip -P $cache_dir \ + && unzip $metric_path.zip -d $cache_dir/ && touch $metric_path/._OK + } + + # to check if cuda libs are configured and GPU is available + # python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" + export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH + python -m bleurt.score_files --bleurt_checkpoint=$metric_path \ + --candidate_file=$prefix.mt --reference_file=$prefix.ref \ + --bleurt_batch_size 64 2> /dev/null +} + +marian_score() { + local metric=$1 + local prefix=$2 + case $metric in + wmt20-comet-qe-da) metric="comet20-da-src" ;; + wmt20-comet-da) metric="comet20-da-src+ref" ;; + BLEURT-20) metric="bleurt20-ref" ;; + *) echo "Unknown metric $metric"; exit 1;; + esac + marian-score.sh -d '0' -n $metric --src $prefix.src --ref $prefix.ref --mt $prefix.mt --seg +} + + +main() { + cd $MYDIR + local metric_names=(BLEURT-20 wmt20-comet-da wmt20-comet-qe-da) + export CUDA_VISIBLE_DEVICES=0 + local max_tests=10 + local max_lines=100 # in each testset + while IFS=$'\t' read tset pair ref mt; do + for mn in ${metric_names[@]}; do + log "Comparing >> $mn << on $tset $pair $ref $mt" + local data=$(sacrebleu -t $tset -l $pair --echo src ref $mt) + local tmp_pref=tmp.testset + rm -rf $tmp_pref.{src,ref,mt} + cut -f1 <<< "$data" | head -n $max_lines > $tmp_pref.src + cut -f2 <<< "$data" | head -n $max_lines > $tmp_pref.ref + cut -f3 <<< "$data" | head -n $max_lines > $tmp_pref.mt + if [[ $mn =~ BLEURT* ]]; then + local orig_out=$(bleurt_score $mn $tmp_pref) + else + local orig_out=$(unbabel_score $mn $tmp_pref 2> /dev/null) + fi + local marian_out=$(marian_score $mn $tmp_pref) + paste <(echo "$marian_out") <(echo "$orig_out") \ + | awk -F '\t' -v OFS='\t' -v mn=$mn \ + 'BEGIN {tot=0.0} {diff=sqrt(($1-$2)^2); tot+=diff; print diff,$0} + END {printf "\n===Avg diff in %s: %f===\n\n", mn, tot/NR}' + #TODO1: extract averages and write to a report file + #TODO2: benchmark speeds + done + done < <(get_sacrebleu_names | head -n $max_tests) +} + +main "$@" \ No newline at end of file diff --git a/scripts/metrics/docker-run.sh b/scripts/metrics/docker-run.sh new file mode 100755 index 000000000..c379c4415 --- /dev/null +++ b/scripts/metrics/docker-run.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd $MYDIR + +IMAGE="marian-dev" + +VISIBLE_GPUS="1" # exlcude 0 for now; run on single GPU + +MOUNTS="-v $PWD:$PWD" +for cache in .sacrebleu .cache/{marian,torch,huggingface,bleurt}; do + MOUNTS+=" -v $HOME/$cache:/root/$cache" +done + + +cmd="docker run --rm -i $MOUNTS --gpus "\"device=$VISIBLE_GPUS\"" -t $IMAGE" + +# uncomment for an interactive shell +# $cmd bash + +$cmd $PWD/compare.sh $@ diff --git a/scripts/metrics/marian-score.sh b/scripts/metrics/marian-score.sh new file mode 100755 index 000000000..873ef5921 --- /dev/null +++ b/scripts/metrics/marian-score.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +set -eu + +MYDIR=$(realpath $(dirname ${BASH_SOURCE[0]})) + + +METRICS_CACHE=$HOME/.cache/marian/metrics + +log() { + echo -e "[$(date '+%Y-%m-%d %H:%M:%S')] $@" >&2 +} + +which marian > /dev/null || { + log "marian not found in PATH. 
Please add marian binary to \$PATH and rerun" + exit 2 +} + +metric_name= +src_file= +ref_file= +hyp_file= +is_seg= +debug_mode= +batch_size=32 +pool_size=10 +max_length=256 +devices=0 +workspace=-4000 + +usage() { + log " ${BASH_SOURCE##*/} -n METRIC -m HYP [-s SRC] [-r REF] [-d DEVICES] [--seg] [--debug] [-h|--help] + +Args: + -n|--name|--metric NAME Metric name; required. See below for details. + -m|--mt|--hyp FILE MT hypothesis, required for all metrics. + -s|--src FILE Source file, required for source based metrics. + -r|--ref FILE Reference file, required for reference based metrics. + -d|--devices DEV IDs of GPU devices to use. Use quoted string to pass multiple values. Default: '$devices' + --seg Output segment-level scores. Default: print only the corpus-level score (mean of segment scores) + --debug Enable verbose mode (default is quiet) + -h|--help Print this help message + +Metric name (-n|--name) shuld be a subdir name under $METRICS_CACHE. +The metric name should have a suffix (-src|-qe|-ref|-src+ref) indicating the type of metric: + *-src|*-qe Source-based metric and requires --src arg, e.g., comet20-src or comet20-da-qe + *-ref Reference-based metric and requires --ref arg, e.g., bleurt20-ref + *-src+ref Both source and reference based and requires --src and --ref args e.g., comet20-src+ref +" +} + +while [[ $# -gt 0 ]]; do + case $1 in + -s|--src) src_file=$2; shift 2;; + -r|--ref) ref_file=$2; shift 2;; + -m|--mt|--hyp) hyp_file=$2; shift 2;; + -n|--name|--metric) metric_name=$2; shift 2;; + -d|--devices) devices=$2; shift 2;; + --seg) is_seg=1; shift 1;; + --debug) debug_mode=1; shift 1;; + -h|--help) usage; exit 0;; + *) log "ERROR: unknown option $1"; usage; exit 1;; + esac +done + +[[ -n $metric_name ]] || { log "ERROR: metric_name=$metric_name name not provided"; usage; exit 1; } +[[ -e $hyp_file ]] || { log "ERROR: hyp file not provided"; usage; exit 1; } + +metric_dir=$METRICS_CACHE/$metric_name +checkpoint=$(echo $metric_dir/*model.npz) # file model.npz or .model.npz +vocab=$(echo $metric_dir/*vocab.spm) +[[ -f $checkpoint && -f $vocab ]] || { + log "ERROR: metric $metric_name is not valid. See ls $METRICS_CACHE/$metric_name/{*model.npz,*vocab.spm}" + exit 1 +} + +# args common to all models +cmd="marian evaluate -w -4000" +[[ -n $devices ]] && cmd+=" -d $devices" +[[ -n $debug_mode ]] || cmd+=" --quiet" +cmd+=" -m $checkpoint --max-length $max_length --max-length-crop --mini-batch $batch_size --maxi-batch $pool_size -t stdin --tsv" +input= # to be filled later + + +check_file(){ + local name=$1 + local file=$2 + [[ -e $file ]] || { log "ERROR: $name file $file does not exist"; exit 1; } + [[ -s $file ]] || { log "ERROR: $name file $file is empty"; exit 1; } +} + +metric_type=${metric_name##*-} # suffix expected: src, ref, src+ref +case $metric_type in + src|qe) + # two sequences: src, hyp + check_file src $src_file + cmd+=" --like comet-qe -v $vocab $vocab" + input="paste $src_file $hyp_file" + ;; + ref) + check_file ref $ref_file + # two sequences: ref, hyp + cmd+=" --like bleurt -v $vocab $vocab" + input="paste $ref_file $hyp_file" + ;; + src+ref) + # three sequences: src, hyp, ref; three vocabularies + check_file src $src_file + check_file ref $ref_file + cmd+=" --like comet -v $vocab $vocab $vocab" + input="paste $src_file $hyp_file $ref_file" + ;; + *) + log "ERROR: $metric_name is not valid. 
Valid metrics have suffix '-{src|qe|ref|src+ref}'" + exit 3 + ;; +esac + +if [[ -z $is_seg ]]; then + cmd+=" --average only"; +fi +pipeline="$input | $cmd | cut -f1 -d' '" + +# mean (default) or segment-level scores + +log "Running: $pipeline" +eval $pipeline diff --git a/scripts/metrics/setup.sh b/scripts/metrics/setup.sh new file mode 100755 index 000000000..df16563a6 --- /dev/null +++ b/scripts/metrics/setup.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd $MYDIR + +#SSH_KEY=$HOME/.ssh/id_rsa # for git clone inside docker build +IMAGE=marian-dev +echo "Building docker image $IMAGE" +#DOCKER_BUILDKIT=1 docker build --ssh default=$SSH_KEY . -f Dockerfile -t $IMAGE +DOCKER_BUILDKIT=1 docker build . -f Dockerfile -t $IMAGE + + +# Optional build args: +# --build-arg MARIAN_COMMIT=master \ +# --build-arg MARIAN_REPO=https://github.com/marian-nmt/marian-dev.git \ +# --build-arg NCPUS=16 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d1f119335..77c455946 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -61,6 +61,7 @@ set(MARIAN_SOURCES tensors/cpu/tensor_operators.cpp tensors/cpu/integer_common.cpp tensors/cpu/fbgemm/packed_gemm.cpp + tensors/gpu/gpu_info.cpp graph/expression_graph.cpp graph/expression_operators.cpp diff --git a/src/command/marian_evaluator.cpp b/src/command/marian_evaluator.cpp new file mode 100644 index 000000000..bd9617b54 --- /dev/null +++ b/src/command/marian_evaluator.cpp @@ -0,0 +1,15 @@ +#include "marian.h" + +#include "models/model_task.h" +#include "evaluator/evaluator.h" +#include "common/timer.h" + +int main(int argc, char** argv) { + using namespace marian; + + // @TODO: add mode evaluating + auto options = parseOptions(argc, argv, cli::mode::evaluating); + New>(options)->run(); + + return 0; +} diff --git a/src/command/marian_main.cpp b/src/command/marian_main.cpp index dcdea4662..e838fe808 100644 --- a/src/command/marian_main.cpp +++ b/src/command/marian_main.cpp @@ -28,6 +28,9 @@ #define main mainEmbedder #include "marian_embedder.cpp" #undef main +#define main mainEvaluator +#include "marian_evaluator.cpp" +#undef main #define main mainVocab #include "marian_vocab.cpp" #undef main @@ -49,6 +52,7 @@ int main(int argc, char** argv) { else if(cmd == "decode") return mainDecoder(argc, argv); else if (cmd == "score") return mainScorer(argc, argv); else if (cmd == "embed") return mainEmbedder(argc, argv); + else if (cmd == "evaluate") return mainEvaluator(argc, argv); else if (cmd == "vocab") return mainVocab(argc, argv); else if (cmd == "convert") return mainConv(argc, argv); std::cerr << "Command must be train, decode, score, embed, vocab, or convert." 
<< std::endl; diff --git a/src/common/config.cpp b/src/common/config.cpp index a1c4ed5ac..efdd29c12 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -247,48 +247,23 @@ std::vector Config::getDevices(Ptr options, } // GPU: devices[] are interpreted in a more complex way else { - size_t numDevices = options->get("num-devices", 0); std::vector deviceNos; - for(auto d : devicesArg) - deviceNos.push_back((size_t)std::stoull(d)); + for(auto d : devicesArg) { + if(d == "all") { + // on encoutering "all" overwrite all given ids with all available ids + size_t numDevices = gpu::availableDevices(); + deviceNos.resize(numDevices); + std::iota(deviceNos.begin(), deviceNos.end(), 0); + break; + } else { + deviceNos.push_back((size_t)std::stoull(d)); + } + } - // if devices[] is empty then default to 0..N-1, where N = numDevices or 1 if (deviceNos.empty()) { - if(numDevices == 0) // if neither is given, then we default to 1 device, which is device[0] - numDevices = 1; - for(size_t i = 0; i < numDevices; ++i) // default to 0..N-1 - deviceNos.push_back(i); - } - // devices[] is not empty - else if(numDevices == 0) // if device list then num devices defaults to list size - numDevices = deviceNos.size(); // default to #devices - - // If multiple MPI processes then we can either have one set of devices shared across all - // MPI-processes, or the full list across all MPI processes concatenated. E.g. --num-devices 1 - // --devices 0 2 4 5 means 4 processes using devices 0, 2, 4, and 5, respectively. In that - // case, we cut out and return our own slice. In the above example, for MPI process 1, we would - // return {2}. - - // special-case the error message (also caught indirectly below, but with a msg that is - // confusing when one does not run multi-node) - if(numMPIProcesses == 1) - // same as requiring numPerMPIProcessDeviceNos == 1 - // @TODO: improve logging message as devices[] and numDevices are not informative for the user - ABORT_IF(numDevices != deviceNos.size(), "devices[] size must be equal to numDevices"); - - // how many lists concatenated in devices[]? Allowed is either 1 (=shared) or numWorkers - size_t numPerMPIProcessDeviceNos = deviceNos.size() / numDevices; - // @TODO: improve logging message as devices[] and numDevices are not informative for the user - ABORT_IF(numDevices * numPerMPIProcessDeviceNos != deviceNos.size(), - "devices[] size must be equal to or a multiple of numDevices"); // (check that it is a multiple) - - // if multiple concatenated lists are given, slice out the one for myMPIRank - if(numPerMPIProcessDeviceNos != 1) { - ABORT_IF(numPerMPIProcessDeviceNos != numMPIProcesses, - "devices[] must either list a shared set of devices, or one set per MPI process"); - deviceNos.erase(deviceNos.begin(), deviceNos.begin() + myMPIRank * numDevices); - deviceNos.resize(numDevices); + deviceNos.push_back(0); } + // form the final vector for(auto d : deviceNos) devices.push_back({ d, DeviceType::gpu }); diff --git a/src/common/config.h b/src/common/config.h index c5a016e68..c22d7415e 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -14,12 +14,17 @@ namespace marian { +namespace gpu { + // defined in src/tensors/gpu/gpu_info.cpp + size_t availableDevices(); +} + // TODO: Finally refactorize Config, Options, ConfigParser and ConfigValidator // classes. 
// // TODO: The problem is that there are many config classes in here, plus // "configuration" can refer to the high-level concept of the entire program's -// configuration, and/or any of its representations. Avoidthe term "config" and +// configuration, and/or any of its representations. Avoid the term "config" and // always qualify it what kind of config, e.g. new Options instance. // // TODO: What is not clear is the different config levels as there are classes diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d70048fe9..0d8021bf1 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -94,6 +94,9 @@ ConfigParser::ConfigParser(cli::mode mode) case cli::mode::embedding: addOptionsEmbedding(cli_); break; + case cli::mode::evaluating: + addOptionsEvaluating(cli_); + break; default: ABORT("wrong CLI mode"); break; @@ -563,7 +566,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") ->implicit_val("100 10 3.0f"); - cli.add("--fp16-fallback-to-fp32", + cli.add("--fp16-fallback-to-fp32", "If fp16 training diverges and throws try to continue training with fp32 precision"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " @@ -824,7 +827,7 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { } void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) { - auto previous_group = cli.switchGroup("Scorer options"); + auto previous_group = cli.switchGroup("Embedder options"); // clang-format off cli.add("--no-reload", @@ -856,17 +859,122 @@ void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) { "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16", {"float32"}); + cli.add("--like", + "Set good defaults for supported embedder types: roberta (works for all COMET flavors)"); + + // Short-cut for Unbabel comet-qe metric + cli.alias("like", "roberta", [](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["input-types"] = std::vector({"sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + + cli.switchGroup(previous_group); + // clang-format on +} + +void ConfigParser::addOptionsEvaluating(cli::CLIWrapper& cli) { + auto previous_group = cli.switchGroup("Evaluator options"); + + cli.add("--no-reload", + "Do not load existing model specified in --model arg"); + // @TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice + cli.add>("--train-sets,-t", + "Paths to corpora to be scored: source target"); + cli.add("--output,-o", + "Path to output file, stdout by default", + "stdout"); + cli.add>("--vocabs,-v", + "Paths to vocabulary files have to correspond to --train-sets. " + "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. " + "If these files do not exists they are created"); + cli.add("--width", + "Floating point precision of metric outputs", + 4); + cli.add("--average", + "Report average of all sentence-level values. 
By default the average is appended as the last line. " + "Alternatively, we can provide `--average only` which supresses other values.", + "skip")->implicit_val("append"); + + addSuboptionsInputLength(cli); + addSuboptionsTSV(cli); + addSuboptionsDevices(cli); + addSuboptionsBatching(cli); + + cli.add("--fp16", + "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); + cli.add>("--precision", + "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16", + {"float32"}); + + cli.add("--like", + "Set good defaults for supported metric types: comet-qe, comet, bleurt"); + + // Short-cut for Unbabel comet-qe metric + cli.alias("like", "comet-qe", [](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["tsv"] = true; + config["tsv-fields"] = 2; + config["input-types"] = std::vector({"sequence", "sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + + // Short-cut for Unbabel comet metric + cli.alias("like", "comet", [cli](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["tsv"] = true; + config["tsv-fields"] = 3; + config["input-types"] = std::vector({"sequence", "sequence", "sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + + // Short-cut for Google bleurt metric + cli.alias("like", "bleurt", [](YAML::Node& config) { + // Model options + config["train-sets"] = std::vector({"stdin"}); + config["tsv"] = true; + config["tsv-fields"] = 2; + config["input-types"] = std::vector({"sequence", "sequence"}); + config["max-length"] = 512; + config["max-length-crop"] = true; + config["mini-batch"] = 32; + config["maxi-batch"] = 100; + config["maxi-batch-sort"] = "src"; + config["workspace"] = -4000; + config["devices"] = std::vector({"all"}); + }); + cli.switchGroup(previous_group); // clang-format on } + void ConfigParser::addSuboptionsDevices(cli::CLIWrapper& cli) { // clang-format off cli.add>("--devices,-d", - "Specifies GPU ID(s) to use for training. Defaults to 0..num-devices-1", + "Specifies GPU ID(s) (e.g. '0 1 2 3' or 'all') to use for training. Defaults to GPU ID 0", {"0"}); - cli.add("--num-devices", - "Number of GPUs to use for this process. 
Defaults to length(devices) or 1"); #ifdef USE_NCCL if(mode_ == cli::mode::training) { cli.add("--no-nccl", @@ -1093,10 +1201,6 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) cli_.updateConfig(config, cli::OptionPriority::CommandLine, "A shortcut for STDIN failed."); } - if(doValidate) { - ConfigValidator(config_).validateOptions(mode_); - } - // remove extra config files from the config to avoid redundancy config_.remove("config"); @@ -1109,6 +1213,10 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) cli_.parseAliases(); } + if(doValidate) { // validate before options are dumped and we exit + ConfigValidator(config_, true).validateOptions(mode_); + } + bool minimal = (dumpMode == "minimal" || dumpMode == "expand"); std::cout << cli_.dumpConfig(minimal) << std::endl; exit(0); @@ -1186,6 +1294,10 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) #endif cli_.parseAliases(); + if(doValidate) { // validate the options after aliases are expanded + ConfigValidator(config_).validateOptions(mode_); + } + auto opts = New(); opts->merge(Config(*this).get()); return opts; diff --git a/src/common/config_parser.h b/src/common/config_parser.h index 18b6eccb7..617b86e5a 100644 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -14,7 +14,7 @@ namespace marian { namespace cli { -enum struct mode { training, translation, scoring, server, embedding }; +enum struct mode { training, translation, scoring, server, embedding, evaluating }; } // namespace cli /** @@ -130,6 +130,7 @@ class ConfigParser { void addOptionsTranslation(cli::CLIWrapper&); void addOptionsScoring(cli::CLIWrapper&); void addOptionsEmbedding(cli::CLIWrapper&); + void addOptionsEvaluating(cli::CLIWrapper&); void addAliases(cli::CLIWrapper&); diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 5563b240d..1b31b96a2 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -17,6 +17,10 @@ ConfigValidator::ConfigValidator(const YAML::Node& config) dumpConfigOnly_(config["dump-config"] && !config["dump-config"].as().empty() && config["dump-config"].as() != "false") {} +ConfigValidator::ConfigValidator(const YAML::Node& config, bool dumpConfigOnly) + : config_(config), + dumpConfigOnly_(dumpConfigOnly) {} + ConfigValidator::~ConfigValidator() {} void ConfigValidator::validateOptions(cli::mode mode) const { @@ -33,6 +37,10 @@ void ConfigValidator::validateOptions(cli::mode mode) const { validateOptionsParallelData(); validateOptionsScoring(); break; + case cli::mode::evaluating: + validateOptionsParallelData(); + validateOptionsScoring(); + break; case cli::mode::training: validateOptionsParallelData(); validateOptionsTraining(); @@ -49,9 +57,13 @@ void ConfigValidator::validateOptions(cli::mode mode) const { void ConfigValidator::validateOptionsTranslation() const { auto models = get>("models"); - auto configs = get>("config"); + bool no_configs = true; + if(has("config")) { + auto configs = get>("config"); + no_configs = configs.empty(); + } - ABORT_IF(models.empty() && configs.empty(), + ABORT_IF(models.empty() && no_configs, "You need to provide at least one model file or a config file"); #ifdef COMPILE_CPU @@ -195,8 +207,8 @@ void ConfigValidator::validateDevices(cli::mode /*mode*/) const { std::string help; // valid strings: '0', '0 1 2 3', '3 2 0 1' - pattern = "[0-9]+( *[0-9]+)*"; - help = "Supported formats: '0 1 2 3'"; + pattern = "([0-9]+|all)( *([0-9]+|all))*"; + help = "Supported 
formats: '0 1 2 3' or 'all'"; ABORT_IF(!regex::regex_match(devices, pattern), "the argument '{}' for option '--devices' is invalid. {}", diff --git a/src/common/config_validator.h b/src/common/config_validator.h index 0e73a9e39..e5742194c 100644 --- a/src/common/config_validator.h +++ b/src/common/config_validator.h @@ -10,13 +10,14 @@ class ConfigValidator { const YAML::Node& config_; bool has(const std::string& key) const; + template T get(const std::string& key) const { return config_[key].as(); } - // The option --dump-config is used, so alleviate some constraints, e.g. we don't want to require - // --train-sets or --vocabs + // When --dump-config is used, alleviate some constraints, for example, do not + // require --train-sets or --vocabs bool dumpConfigOnly_{false}; void validateOptionsTranslation() const; @@ -29,6 +30,7 @@ class ConfigValidator { public: ConfigValidator(const YAML::Node& config); + ConfigValidator(const YAML::Node& config, bool dumpConfigOnly); virtual ~ConfigValidator(); // Validate options according to the given mode. Abort on first validation error diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index a429ae2f3..5fbfe636b 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -61,7 +61,8 @@ CorpusBase::CorpusBase(const std::vector& paths, rightLeft_(options_->get("right-left")), prependZero_(options_->get("comet-prepend-zero", false)), tsv_(options_->get("tsv", false)), - tsvNumInputFields_(getNumberOfTSVInputFields(options)) { + tsvNumInputFields_(getNumberOfTSVInputFields(options)), + joinFields_(options_->get("input-join-fields", false)) { // TODO: support passing only one vocab file if we have fully-tied embeddings if(tsv_) { ABORT_IF(tsvNumInputFields_ != vocabs_.size(), @@ -87,7 +88,8 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) rightLeft_(options_->get("right-left")), prependZero_(options_->get("comet-prepend-zero", false)), tsv_(options_->get("tsv", false)), - tsvNumInputFields_(getNumberOfTSVInputFields(options)) { + tsvNumInputFields_(getNumberOfTSVInputFields(options)), + joinFields_(options_->get("input-join-fields", false)) { bool training = !translate; if(training) @@ -426,8 +428,12 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, auto inputTypes = options_->get>("input-types", {}); // empty list by default - if(prependZero_ && inputTypes[batchIndex] == "sequence") - words.insert(words.begin(), Word::fromWordIndex(0)); + // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) + bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex == 0)); + if(prepend && inputTypes[batchIndex] == "sequence") { + auto prependedWord = Word::fromWordIndex(0); + words.insert(words.begin(), prependedWord); + } if(maxLengthCrop_ && words.size() > maxLength_) { words.resize(maxLength_); @@ -438,7 +444,12 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, if(rightLeft_) std::reverse(words.begin(), words.end() - 1); - tup.push_back(words); + // if true, the numeric indices get joined with the previous sentence, acts as a separator here + // @TODO: make this cleaner. 
+ if(joinFields_) + tup.appendToBack(words); + else + tup.pushBack(words); } void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 123250d97..7a03414b4 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -72,7 +72,22 @@ class SentenceTupleImpl { * * @param words A vector of word indices. */ - void push_back(const Words& words) { tuple_.push_back(words); } + void pushBack(const Words& words) { tuple_.push_back(words); } + + /** + * @brief Appends mroe words to the last sentence of the tuple. + * + * @param words A vector of word indices. + */ + void appendToBack(const Words& words) { + if(tuple_.empty()) { + tuple_.push_back(words); + } else { + for(auto& w : words) { + tuple_.back().push_back(w); + } + } + } /** * @brief The size of the tuple, e.g. two for parallel data with a source and @@ -644,6 +659,9 @@ class CorpusBase : public DatasetBase& vec) { if(binary_) { outStrm_->write((char*)vec.data(), vec.size() * sizeof(float)); } else { - *outStrm_ << std::fixed << std::setprecision(4); + *outStrm_ << std::fixed << std::setprecision(width_); for(auto v : vec) *outStrm_ << v << " "; *outStrm_ << std::endl; } } +void AveragingVectorCollector::WriteVector(const std::vector& vec) { + if(!onlyLast_) + VectorCollector::WriteVector(vec); + + if(sum_.size() < vec.size()) + sum_.resize(vec.size()); + for(size_t i = 0; i < vec.size(); ++i) + sum_[i] += vec[i]; + count_++; +} + +void AveragingVectorCollector::WriteAverage() { + std::lock_guard lock(mutex_); + auto avg = sum_; + for(auto& val : avg) + val /= (float)count_; + VectorCollector::WriteVector(avg); +} + +Ptr VectorCollector::Create(Ptr options) { + std::string average = options->get("average", "skip"); + std::string output = options->get("output"); + size_t width = options->get("width", DEFAULT_WIDTH); + + Ptr collector; + if(average == "skip") + collector = New(output, /*binary=*/false, width); + else if(average == "append") + collector = New(output, /*binary=*/false, width, /*onlyLast=*/false); + else if(average == "only") + collector = New(output, /*binary=*/false, width, /*onlyLast=*/true); + else + ABORT("Unknown configuration for VectorCollector"); + + return collector; +} + } // namespace marian diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index fc39ea6ec..3f1f91e0c 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -11,19 +11,24 @@ namespace marian { // This class manages multi-threaded writing of embedded vectors to stdout or an output file. // It will either output string versions of float vectors or binary equal length versions depending -// on its binary_ flag. +// on its binary flag. If binary=false, width can be used to set the number of decimal places. 
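+// Use VectorCollector::Create(options) to choose the concrete collector from the --average option:
+// "skip" writes per-sentence vectors only, "append" additionally appends their average as the last
+// line, and "only" prints just the average (see AveragingVectorCollector below).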
class VectorCollector { public: - VectorCollector(bool binary=false); - VectorCollector(std::string outFile, bool binary=false); + static const size_t DEFAULT_WIDTH = 4; + + VectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH); + VectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH); virtual ~VectorCollector() {} virtual void Write(long id, const std::vector& vec); + static Ptr Create(Ptr options); + protected: long nextId_{0}; UPtr outStrm_; bool binary_; // output binary floating point vectors if set + size_t width_{DEFAULT_WIDTH}; std::mutex mutex_; @@ -32,4 +37,30 @@ class VectorCollector { virtual void WriteVector(const std::vector& vec); }; + +// Add a running summation of vector elements and outputs the average vector on destruction. +// Can also be configured to omit line-by-line results. +class AveragingVectorCollector : public VectorCollector { +private: + std::vector sum_; + size_t count_{0}; + bool onlyLast_{false}; + +protected: + virtual void WriteVector(const std::vector& vec) override; + +public: + AveragingVectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH, bool onlyLast=false) + : VectorCollector(binary, width), onlyLast_(onlyLast) {} + + AveragingVectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH, bool onlyLast=false) + : VectorCollector(outFile, binary, width), onlyLast_(onlyLast) {} + + virtual ~AveragingVectorCollector() { + WriteAverage(); + } + + virtual void WriteAverage(); +}; + } // namespace marian diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h new file mode 100644 index 000000000..31fe00e87 --- /dev/null +++ b/src/evaluator/evaluator.h @@ -0,0 +1,155 @@ +#pragma once + +#include "marian.h" + +#include "common/config.h" +#include "common/options.h" +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "data/corpus_nbest.h" +#include "models/costs.h" +#include "models/model_task.h" +#include "embedder/vector_collector.h" +#include "training/scheduler.h" +#include "training/validator.h" + +namespace marian { + +using namespace data; + +/* + * The tool is used to calculate metric score for various neural metrics. + * @TODO: add the string-based matrics that we have already implemented like bleu and chrf. + */ +class Evaluator { +private: + Ptr model_; + +public: + Evaluator(Ptr options) + : model_(createModelFromOptions(options, models::usage::evaluating)) {} + + void load(Ptr graph, const std::vector& items) { + model_->load(graph, items); + } + + void load(Ptr graph, const std::string& fileName) { + model_->load(graph, fileName); + } + + Expr build(Ptr graph, Ptr batch) { + auto evaluator = std::dynamic_pointer_cast(model_); + ABORT_IF(!evaluator, "Could not cast to EncoderPooler"); + return evaluator->apply(graph, batch, /*clearGraph=*/true)[0]; + } +}; + +/* + * Actual Evaluate task. @TODO: this should be simplified in the future. 
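+ *
+ * Usage sketch (not authoritative; it mirrors the pipeline assembled by scripts/metrics/marian-score.sh,
+ * and the file names below are placeholders):
+ *
+ *   paste src.txt mt.txt \
+ *   | marian evaluate --like comet-qe -m comet20-qe-da.model.npz \
+ *       -v roberta.vocab.spm roberta.vocab.spm --average only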
+ */ +template +class Evaluate : public ModelTask { +private: + Ptr options_; + Ptr corpus_; + std::vector> graphs_; + std::vector> models_; + std::vector ioItems_; + +public: + Evaluate(Ptr options) : options_(options) { + options_ = options_->with("inference", true, + "shuffle", "none"); + + corpus_ = New(options_); + corpus_->prepare(); + + auto devices = Config::getDevices(options_); + + auto modelPath = options_->get("model"); + LOG(info, "Loading model from {}", modelPath); + ioItems_ = io::loadItems(modelPath); + + graphs_.resize(devices.size()); + models_.resize(devices.size()); + + ThreadPool pool(devices.size(), devices.size()); + for(size_t i = 0; i < devices.size(); ++i) { + pool.enqueue( + [=](size_t j) { + auto graph = New(true); + auto precison = options_->get>("precision", {"float32"}); + graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph + graph->setDevice(devices[j]); + graph->reserveWorkspaceMB(options_->get("workspace")); + + auto model = New(options_); + model->load(graph, ioItems_); + + models_[j] = model; + graphs_[j] = graph; + }, + i); + } + } + + void run() override { + LOG(info, "Evaluating"); + timer::Timer timer; + + auto batchGenerator = New>(corpus_, options_); + batchGenerator->prepare(); + + Ptr output = VectorCollector::Create(options_); + + size_t batchId = 0; + { + ThreadPool pool(graphs_.size(), graphs_.size()); + + for(auto batch : *batchGenerator) { + auto task = [=](size_t id) { + thread_local Ptr graph; + thread_local Ptr builder; + + if(!graph) { + graph = graphs_[id % graphs_.size()]; + builder = models_[id % graphs_.size()]; + } + + auto scores = builder->build(graph, batch); + graph->forward(); + + // handle copying from fp32 or fp16 scores correctly. + std::vector sentVectors; + if(scores->value_type() == Type::float32) { + scores->val()->get(sentVectors); + } else if (scores->value_type() == Type::float16) { + std::vector sentVectors16; + scores->val()->get(sentVectors16); + sentVectors.reserve(sentVectors16.size()); + for(auto& v: sentVectors16) + sentVectors.push_back(v); + } else { + ABORT("Unknown value type {}", scores->value_type()); + } + + // collect embedding vector per sentence. + // if we compute similarities this is only one similarity per sentence pair. 
+ for(size_t i = 0; i < batch->size(); ++i) { + auto numScores = scores->shape()[-1]; + auto beg = i * numScores; + auto end = (i + 1) * numScores; + std::vector sentVector(sentVectors.begin() + beg, sentVectors.begin() + end); + output->Write((long)batch->getSentenceIds()[i], sentVector); + } + }; + + pool.enqueue(task, batchId++); + } + } + LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); + } + +}; + +} // namespace marian diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index c928e8ce0..0ec6f7e67 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -582,10 +582,14 @@ Expr bdot_legacy(Expr a, Expr b, bool transA, bool transB, float scale) { Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // general version, MKL, CBlas or CUDA + std::vector nodes = { a, b, bias }; - int rows = a->shape().elements() / a->shape()[-1]; - Expr ones = a->graph()->ones({ rows, 1 }); - std::vector nodes = { a, b, bias, ones }; + auto graph = a->graph(); + if(!graph->isInference()) { + int rows = a->shape().elements() / a->shape()[-1]; + Expr ones = a->graph()->ones({ rows, 1 }, bias->value_type()); + nodes.push_back(ones); + } return Expression(nodes, transA, transB, scale); } diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 2c997d577..d35ca6fff 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -317,6 +317,8 @@ class AffineNodeOp : public NaryNodeOp { if(!isParameter(child(2)) && computeTypeC == Type::float16) computeTypeC = Type::float32; + ABORT_IF(children().size() != 4, "Did we lose the column of ones required for backprob of bias??"); + // We reduce bias gradients with a matrix multiply if(!transA_ && transB_) return { diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 85c14f51b..93c6d9b33 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -191,8 +191,7 @@ Expr Embedding::applyIndices(const std::vector& embIdx, const Shape& // clang-format on if(options_->hasAndNotEmpty("embedding-vectors")) { auto embFiles = opt>("embedding-vectors"); - options->set( - "embFile", embFiles[batchIndex_], "normalization", opt("embedding-normalization")); + options->set("embFile", embFiles[batchIndex_], "normalization", opt("embedding-normalization")); } return New(graph_, options); } diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 035e6c51d..4f4838e48 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -178,7 +178,8 @@ static Ptr attentionFromOptions(Ptr graph, Ptr< // in the future we might add SingleHead or Additive or LSH-based as in Reformer if(selfAttentionType == "default") { int numHeads = options->get("transformer-heads"); - int modelDim = options->get("dim-emb"); + int modelDim = options->get("transformer-dim-model", options->get("dim-emb")); + float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 51f2ef4e3..278758a96 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -130,10 +130,18 @@ struct Linear : public Layer, public IUnaryLayer { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } + Type outputType = x->value_type(); if(useBias) - return marian::affine(x, weight, bias, /*transA=*/false, 
/*transB=*/transposed); + return marian::affine(x, + marian::cast(weight, outputType), + marian::cast(bias, outputType), + /*transA=*/false, + /*transB=*/transposed); else - return marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); + return marian::dot(x, + marian::cast(weight, outputType), + /*transA=*/false, + /*transB=*/transposed); } }; diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index 8776820ef..e808694de 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -126,7 +126,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - int modelDim = opt("dim-emb"); + int modelDim = opt("transformer-dim-model", opt("dim-emb")); int ffnDim = opt("transformer-dim-ffn"); if(isDecoder && opt("transformer-decoder-dim-ffn") != 0) ffnDim = opt("transformer-decoder-dim-ffn"); @@ -370,7 +370,8 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { registerLayer(preprocessor); // @TODO: factory to support different attention flavors? - rnn = New>(graph, opt("dim-emb"), opt("transformer-rnn-projection", false)); + int modelDim = opt("transformer-dim-model", opt("dim-emb")); + rnn = New>(graph, modelDim, opt("transformer-rnn-projection", false)); registerLayer(rnn); postprocessor = New( diff --git a/src/models/bleurt.h b/src/models/bleurt.h new file mode 100644 index 000000000..131b675a7 --- /dev/null +++ b/src/models/bleurt.h @@ -0,0 +1,217 @@ +#pragma once + +#include "layers_new/transformer.h" + +#include "models/encoder.h" +#include "layers/constructors.h" + +namespace marian { +namespace models { + +class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { +public: + Expr embeddings; + + BleurtTypeEmbeddingLayer(Ptr graph, Ptr options) + : LayerWithOptions(graph, options) {} + + virtual ~BleurtTypeEmbeddingLayer() = default; + + Expr apply(Ptr subBatch) const { + int dimEmb = opt("dim-emb"); + int dimTypes = opt("bert-type-vocab-size", 2); + + // Embedding layer initialization should depend only on embedding size, hence fanIn=false + auto initFunc = inits::glorotUniform(/*fanIn=*/false, /*fanOut=*/true); // -> embedding vectors have roughly unit length + registerParameterLazy(embeddings, Shape({dimTypes, dimEmb}), initFunc); + + const auto& words = subBatch->data(); + const auto vocab = subBatch->vocab(); + + // Get word id of special symbols + Word sepId = vocab->getEosId(); + + int dimBatch = (int)subBatch->batchSize(); + int dimTime = (int)subBatch->batchWidth(); + const size_t maxSentPos = dimTypes; + + // create indices for BERT sentence embeddings A and B + std::vector sentenceIndices(dimBatch * dimTime, 0); // each word is either in sentence A or B + std::vector sentPos(dimBatch, 0); // initialize each batch entry with being A [0] + for(int i = 0; i < dimTime; ++i) { // advance word-wise + for(int j = 0; j < dimBatch; ++j) { // scan batch-wise + int k = i * dimBatch + j; + sentenceIndices[k] = sentPos[j]; // set to current sentence position for batch entry, max position 1. 
+ if(words[k] == sepId && sentPos[j] < maxSentPos) { // if current word is a separator and not beyond range + sentPos[j]++; // then increase sentence position for batch entry (to B [1]) + } + } + } + + return reshape(rows(embeddings, sentenceIndices), {dimTime, dimBatch, dimEmb}); + } +}; + +struct BleurtEncoder final : public nn::TransformerEncoder { + Ptr eProj; + + BleurtEncoder(Ptr graph, + Ptr options) + : TransformerEncoder(graph, options) { + + eProj = New(graph, opt("transformer-dim-model")); + registerLayer(eProj); + + for(auto norm : allLayers()) + norm->eps = 1e-12f; // hard-coded as in original BLEURT model + } + + Expr apply(Expr input, Expr mask) const override { + auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] + + mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + auto binMask = mask; + mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); + + // apply positional embeddings to contextual input + output = positionEmbedding->apply(output); + + // apply dropout or layer-norm to embeddings if required + output = preprocessor->apply(output); + + // scale from 256 to 1152 + output = eProj->apply(output); + + // traverse the layers, use the same mask for each + for(auto layer : *layers) + output = layer->apply(output, mask); + + return output; + } +}; + +// Wrapper for backwards compatibility that uses current encoder/decoder framework +struct BleurtBatchEncoder final : public nn::LayerWithOptions, + public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings + public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? + Ptr typeEmbedding; + Ptr encoder; + + BleurtBatchEncoder(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + EncoderBase(graph, options) + { + typeEmbedding = New(graph, options); + registerLayer(typeEmbedding); + + encoder = New(graph, options); + registerLayer(encoder); + } + + // @TODO: subBatch should be of type Expr + virtual std::tuple apply(Ptr subBatch) const override { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + const auto& [batchEmbeddings, batchMask] = embeddingLayer->apply(subBatch); + +#if 1 + auto typeEmbeddings = typeEmbedding->apply(subBatch); + auto embeddings = batchEmbeddings + typeEmbeddings; +#else + auto embeddings = batchEmbeddings; +#endif + + auto batchContext = encoder->apply(embeddings, batchMask); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return std::make_tuple(batchContext, batchMask); + } + + virtual Expr apply(const Words& words, const Shape& shape) const override final { + return applyIndices(toWordIndexVector(words), shape); + } + + // alternative from indices directly + virtual Expr applyIndices(const std::vector& wordIndices, const Shape& shape) const override final { + auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); + Expr batchEmbedding = embeddingLayer->applyIndices(wordIndices, shape); + auto batchContext = encoder->apply(batchEmbedding, /*mask=*/nullptr); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + return batchContext; + } + + // @TODO: currently here for backwards compat, should be replaced with apply() + virtual Ptr build(Ptr graph, + Ptr batch) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. 
Should happen in graph groups and constructors + EncoderBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. + // A good check to have for catching weird situations early. + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + // @TODO: this needs to convert to a BERT-batch + + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); + return New(batchEmbedding, batchMask, batch); + } + + virtual void clear() override { + Layer::clear(); + } +}; + +class BleurtPooler final : public nn::LayerWithOptions, + public PoolerBase { +private: + Ptr layers; + std::mt19937 rng{(uint32_t)Config::seed}; + +public: + BleurtPooler(Ptr graph, Ptr options) + : LayerWithOptions(graph, options), + PoolerBase(graph, options) { + + float dropoutProb = 0.f; + layers = New( + graph, + New(graph, LayerWithOptions::opt("transformer-dim-model")), // @TODO: get rid of amibuigity + New(graph), + New(graph, dropoutProb), + New(graph, 1) + ); + + registerLayer(layers); + } + + std::vector apply(Ptr graph, Ptr batch, const std::vector>& encoderStates) override { +#if 1 + // @TODO: this should be removed, currently hack to init graph. Should happen in graph groups and constructors + PoolerBase::graph_ = graph; + setGraph(graph); + // This makes sure that the graph passed into the model during construction and now evaluation are identical. + // A good check to have for catching weird situations early. + ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); +#endif + + auto modelType = LayerWithOptions::opt("type"); + + auto emb = slice(encoderStates[0]->getContext(), -2, 0); + emb = marian::cast(emb, Type::float32); + + Expr output; + if(LayerWithOptions::opt("usage") == (int)models::usage::evaluating) { + output = layers->apply(emb); + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 1}); + return { output }; + } else { + ABORT("Usage other than evaluating not implemented"); + } + } + + void clear() override {} +}; + +} // namespace models +} // namespace marian + diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index cca18cac7..658d754e1 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -26,9 +26,6 @@ struct CometEncoder final : public nn::TransformerEncoder { // apply positional embeddings to contextual input output = positionEmbedding->apply(output); - // handle for skip connection at top - auto prevOutput = output; - // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); @@ -142,14 +139,34 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, } }; -class CometQEPooler final : public nn::LayerWithOptions, - public PoolerBase { +// Dummpy pooler that only returns the encoder context +class CometEmbeddingPooler final : public nn::LayerWithOptions, + public PoolerBase { +public: + CometEmbeddingPooler(Ptr graph, Ptr options) + : LayerWithOptions(graph, options), + PoolerBase(graph, options) {} + + std::vector apply(Ptr graph, Ptr batch, const std::vector>& encoderStates) override { + auto usage = (models::usage)LayerWithOptions::opt("usage"); + ABORT_IF(usage != models::usage::embedding, "This pooler should only be used for generating embeddings??"); + ABORT_IF(encoderStates.size() != 1, "Size of encoderStates {} != 1", encoderStates.size()); + + return { encoderStates[0]->getContext() }; + } + + void 
clear() override {} +}; + +// Actual COMET-like pooler, works for COMET-QE and COMET models (prior to WMT22) +class CometMetricPooler final : public nn::LayerWithOptions, + public PoolerBase { private: Ptr layers; std::mt19937 rng{(uint32_t)Config::seed}; public: - CometQEPooler(Ptr graph, Ptr options) + CometMetricPooler(Ptr graph, Ptr options) : LayerWithOptions(graph, options), PoolerBase(graph, options) { @@ -221,49 +238,80 @@ class CometQEPooler final : public nn::LayerWithOptions, return {xMixup, yMixup}; }; - ABORT_IF(encoderStates.size() != 2, "Pooler expects exactly two encoder state"); - - auto src = encoderStates[0]->getContext(); - auto mt = encoderStates[1]->getContext(); + auto usage = (models::usage)LayerWithOptions::opt("usage"); + ABORT_IF(usage == models::usage::embedding, "Wrong pooler for embedding??"); + + auto modelType = LayerWithOptions::opt("type"); + ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); + ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); - auto diff = abs(mt - src); - auto prod = mt * src; - - Expr output; - if(LayerWithOptions::opt("usage") == (int)models::usage::embedding) { - auto embFwd = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] - auto embBwd = concatenate({src, mt, prod, diff}, /*axis=*/-1); // [batch, 1, model] - auto emb = concatenate({embFwd, embBwd}, /*axis=*/-2); - output = layers->apply(emb); - - int dimBatch = output->shape()[-3]; - output = reshape(output, {dimBatch, 1, 2}); - return { output }; - } else { - auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + if(modelType == "comet-qe") { + auto src = encoderStates[0]->getContext(); + auto mt = encoderStates[1]->getContext(); - auto softLabelsWords = batch->front()->data(); - auto classVocab = batch->front()->vocab(); + auto diff = abs(mt - src); + auto prod = mt * src; + + Expr output; + if(usage == models::usage::evaluating) { + auto embFwd = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + auto embBwd = concatenate({src, mt, prod, diff}, /*axis=*/-1); // [batch, 1, model] + auto emb = concatenate({embFwd, embBwd}, /*axis=*/-2); + output = layers->apply(emb); + + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 2}); + return { output }; + } else { + auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] + + auto softLabelsWords = batch->front()->data(); + auto classVocab = batch->front()->vocab(); + + int dimBatch = (int)softLabelsWords.size(); + std::vector softLabels; + for(auto w : softLabelsWords) { + // @TODO: this is a super-ugly hack to get regression values + float score = w != Word::NONE ? 
std::stof((*classVocab)[w]) : 0.f; + softLabels.push_back(score); + } + auto labels = graph->constant({dimBatch, 1, 1}, inits::fromVector(softLabels), Type::float32); + + if(getMode() == Mode::train) { + float mixupAlpha = LayerWithOptions::opt("comet-mixup", 0.f); + bool mixupReg = LayerWithOptions::opt("comet-mixup-reg", false); + auto xy = mixup(emb, labels, mixupAlpha, mixupReg); + emb = get<0>(xy); + labels = get<1>(xy); + } + output = marian::cast(layers->apply(emb), Type::float32); + return { output, labels }; + } + } else if(modelType == "comet") { + auto src = encoderStates[0]->getContext(); + auto mt = encoderStates[1]->getContext(); + auto ref = encoderStates[2]->getContext(); - int dimBatch = (int)softLabelsWords.size(); - std::vector softLabels; - for(auto w : softLabelsWords) { - // @TODO: this is a super-ugly hack to get regression values - float score = w != Word::NONE ? std::stof((*classVocab)[w]) : 0.f; - softLabels.push_back(score); - } - auto labels = graph->constant({dimBatch, 1, 1}, inits::fromVector(softLabels), Type::float32); - - if(getMode() == Mode::train) { - float mixupAlpha = LayerWithOptions::opt("comet-mixup", 0.f); - bool mixupReg = LayerWithOptions::opt("comet-mixup-reg", false); - auto xy = mixup(emb, labels, mixupAlpha, mixupReg); - emb = get<0>(xy); - labels = get<1>(xy); + auto diffRef = abs(mt - ref); + auto prodRef = mt * ref; + + auto diffSrc = abs(mt - src); + auto prodSrc = mt * src; + + Expr output; + if(usage == models::usage::evaluating) { + auto emb = concatenate({mt, ref, prodRef, diffRef, prodSrc, diffSrc}, /*axis=*/-1); // [batch, 1, model] + output = layers->apply(emb); + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 1}); + return { output }; + } else { + // Currently no training for COMET with reference @TODO: add training + ABORT("Usage other than 'evaluating' not implemented"); } - output = marian::cast(layers->apply(emb), Type::float32); - return { output, labels }; - } + } else { + ABORT("Unknown model type {}", modelType); + } } void clear() override {} diff --git a/src/models/model_base.h b/src/models/model_base.h index 6a327968a..32705bbe7 100644 --- a/src/models/model_base.h +++ b/src/models/model_base.h @@ -9,8 +9,16 @@ namespace marian { namespace models { -enum struct usage { raw, training, scoring, translation, embedding }; -} +enum struct usage { + raw, + training, + scoring, + translation, + embedding, // used for laser and other models to produce embedding vectors + evaluating // evaluating is a special mode for neural metrics, different from (probabilistic) scoring +}; + +} // namespace models } // namespace marian YAML_REGISTER_TYPE(marian::models::usage, int) diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 40ba122a6..707a81ca9 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -17,6 +17,7 @@ #include "models/transformer_new.h" #include "models/comet_qe.h" +#include "models/bleurt.h" #ifdef CUDNN #include "models/char_s2s.h" @@ -133,40 +134,89 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti Ptr graph = nullptr; // graph unknown at this stage // clang-format off + if(type == "comet-qe" || type == "comet") { + if(type == "comet") { + ABORT_IF(use == usage::training, "Usage {} is not supported for model of type {}", (int)use, type); + ABORT_IF(use == usage::scoring, "Usage {} is not supported for model of type {}", (int)use, type); + } + + auto inputTypes = options->get>("input-types"); + ABORT_IF(inputTypes.empty(), 
+ "Required option --input-types for COMET-QE not set. " + "For inference that should be --input-types sequence sequence. " + "For training set --input-types class sequence sequence"); + + int shift = 0; + if(inputTypes[0] == "class") + shift = 1; + + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + size_t numEncoders = 0; + bool addMetricPooler = false; + bool addEmbeddingPooler = false; + + switch(use) { + case usage::embedding: numEncoders = 1; addEmbeddingPooler = true; break; + case usage::evaluating: + case usage::scoring: + case usage::training: numEncoders = (type == "comet-qe") ? 2 : 3; addMetricPooler = true; break; + default: ABORT("Usage {} is not supported for model of type {}", (int)use, type); + } + + for(size_t i = 0; i < numEncoders; i++) { + auto enc = New(graph, newOptions->with("type", "transformer", "index", i + shift)); + enc->setName("CometEncoder"); // parameters will be shared + res->push_back(enc); + } + + if(addEmbeddingPooler) { + auto pooler = New(graph, newOptions); + pooler->setName("CometEmbeddingPooler"); + res->push_back(pooler); + } + + if(addMetricPooler) { + auto pooler = New(graph, newOptions); + pooler->setName("CometQEPooler"); // @TODO: change name for different models + res->push_back(pooler); + } + + return res; + } + + if(type == "bleurt") { + ABORT_IF(use != usage::evaluating, "Usage other than 'evaluating' is not supported for model of type {}", type); + + auto newOptions = options->with("usage", use); + auto res = New(newOptions); + + auto inputTypes = options->get>("input-types"); + ABORT_IF(inputTypes.empty(), + "Required option --input-types for BLEURT not set. " + "For inference that should be --input-types sequence. " + "For training set --input-types class sequence"); + + int shift = 0; + if(inputTypes[0] == "class") + shift = 1; + + auto enc = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); + enc->setName("BleurtEncoder"); + res->push_back(enc); + + auto pooler = New(graph, newOptions); + pooler->setName("BleurtPooler"); + res->push_back(pooler); + return res; + } + bool trainEmbedderRank = options->hasAndNotEmpty("train-embedder-rank"); if(use == usage::embedding || trainEmbedderRank) { // hijacking an EncoderDecoder model for embedding only - auto dimVocabs = options->get>("dim-vocabs"); size_t fields = trainEmbedderRank ? dimVocabs.size() : 0; int dimVocab = dimVocabs[0]; - - if(type == "comet-qe") { - auto newOptions = options->with("usage", use); - auto res = New(newOptions); - - auto inputTypes = options->get>("input-types"); - ABORT_IF(inputTypes.empty(), - "Required option --input-types for COMET-QE not set. " - "For inference that should be --input-types sequence sequence. 
" - "For training set --input-types class sequence sequence"); - - int shift = 0; - if(inputTypes[0] == "class") - shift = 1; - - auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); - enc1->setName("CometEncoder"); - res->push_back(enc1); - - auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 1 + shift)); - enc2->setName("CometEncoder"); - res->push_back(enc2); - - auto pooler = New(graph, newOptions); - pooler->setName("CometQEPooler"); - res->push_back(pooler); - return res; - } Ptr newOptions; if(options->get("compute-similarity", false)) { @@ -207,28 +257,6 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti return res; } - if(use == usage::training || use == usage::scoring) { - if(type == "comet-qe") { - auto newOptions = options->with("usage", use); - auto res = New(newOptions); - - // For training, first rank in batch is class! - - auto enc1 = New(graph, newOptions->with("type", "transformer", "index", 1)); - enc1->setName("CometEncoder"); - res->push_back(enc1); - - auto enc2 = New(graph, newOptions->with("type", "transformer", "index", 2)); - enc2->setName("CometEncoder"); - res->push_back(enc2); - - auto pooler = New(graph, newOptions); - pooler->setName("CometQEPooler"); - res->push_back(pooler); - return res; - } - } - if(type == "s2s" || type == "amun" || type == "nematus") { return models::encoder_decoder(options->with( "usage", use, @@ -462,10 +490,10 @@ Ptr createModelFromOptions(Ptr options, usage use) { else ABORT("'usage' parameter 'translation' cannot be applied to model type: {}", type); } - else if (use == usage::raw || use == usage::embedding) + else if (use == usage::raw || use == usage::embedding || use == usage::evaluating) return baseModel; else - ABORT("'Usage' parameter must be 'translation' or 'raw'"); + ABORT("'Usage' parameter must be 'translation' or 'raw'"); // I am actually not sure what this is supposed to mean any more. } Ptr createCriterionFunctionFromOptions(Ptr options, usage use) { diff --git a/src/tensors/gpu/gpu_info.cpp b/src/tensors/gpu/gpu_info.cpp new file mode 100644 index 000000000..f6a59465f --- /dev/null +++ b/src/tensors/gpu/gpu_info.cpp @@ -0,0 +1,19 @@ +#include "common/definitions.h" + +#if CUDA_FOUND +#include "tensors/gpu/cuda_helpers.h" +#endif + +namespace marian { +namespace gpu { + size_t availableDevices() { +#if CUDA_FOUND + int deviceCount; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + return (size_t)deviceCount; +#else + return 0; +#endif + } +} +} \ No newline at end of file From d1d10a46bd34b5e5552b8c0ac91313cf0f829dcb Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 1 Jul 2023 08:37:29 +0000 Subject: [PATCH 239/254] Merged PR 30079: Fixes and extends unit test for layer norm Fixes and extends unit test for layer norm. Previous version had a weird usage of Glorot Uniform. --- CHANGELOG.md | 1 + VERSION | 2 +- src/tests/units/operator_tests.cpp | 74 ++++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a436308c7..0fb1dfd2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed unit test for LayerNorm - Only collect batch statistics during mini-batch-fit up to actual max-length. - Implemented fully correct version of GELU instead of using bad approximatin via Swish. 
- Handle copying from fp32 or fp16 embeddings in embedder mode correctly. diff --git a/VERSION b/VERSION index f15731572..893904681 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.6 +v1.12.7 diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 236823fe4..34a0dd6f5 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -271,33 +271,69 @@ void tests(DeviceType device, Type floatType = Type::float32) { graph->clear(); values.clear(); -#ifdef CUDA_FOUND - std::vector vLn({ - -1.1962, 1.43061, 0.380288, -0.614697, 0.816638, 0.622649, - -1.69679, 0.257504, -1.12563, -0.151387, 1.61181, -0.334796, - 1.07207, -0.622614, 0.862014, -1.31147 - }); -#else - std::vector vLn({ - -1.49821, -0.152206, 0.394932, 1.25548, -1.51701, -0.28032, - 0.9483, 0.849025, 0.855183, 1.11657, -0.788354, -1.1834, - -0.85939, -1.13109, 0.972076, 1.01841 - }); -#endif + std::vector init = { + 2.88794374, 4.67853451, 3.96257305, 3.28433037, + 0.37778997, 0.67662024, 4.24959183, 1.23910618, + 0.68929380, 2.00369596, 4.38251686, 1.75624943, + 4.96126175, 3.01947117, 4.72057724, 2.23017120 + }; + + auto a1 = graph->param("test1", {2, 2, 4}, inits::fromVector(init)); + auto a2 = graph->param("test2", {2, 2, 4}, inits::fromVector(init)); - auto a = graph->constant({2, 2, 4}, inits::glorotUniform()); - auto gamma = graph->param("gamma", {1, 4}, inits::ones()); - auto beta = graph->param("beta", {1, 4}, inits::zeros()); - auto ln = layerNorm(a, gamma, beta); + std::vector gammaVec({0.1f, -0.2f, 0.3f, -0.4f}); + std::vector betaVec({-0.1f, 0.2f, -0.3f, 0.4f}); + + auto gamma1 = graph->param("gamma1", {4}, inits::fromVector(gammaVec)); + auto beta1 = graph->param("beta1", {4}, inits::fromVector(betaVec)); + + auto gamma2 = graph->param("gamma2", {4}, inits::fromVector(gammaVec)); + auto beta2 = graph->param("beta2", {4}, inits::fromVector(betaVec)); + + // layernorm via special operator + auto ln = layerNorm(a1, gamma1, beta1, 1e-5f); + + // layernorm via elementary operators + auto num = a2 - mean(a2, /*axis=*/-1); + auto den = sqrt(mean(square(num), /*axis=*/-1) + 1e-5f); + auto ln2 = gamma2 * (num / den) + beta2; + + auto top = sum(flatten(ln + ln2)); graph->forward(); + graph->backward(); CHECK(ln->shape() == Shape({2, 2, 4})); + std::vector values2; + + // compare values of ln and ln2 to make sure forward computation is correct ln->val()->get(values); + ln2->val()->get(values2); + CHECK( std::equal(values.begin(), values.end(), - vLn.begin(), floatApprox) ); + values2.begin(), floatApprox2) ); + // compare adjoints of a1 and a2 (parameters) to makes sure gradient computation is correct + a1->grad()->get(values); + a2->grad()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox2) ); + + // compare adjoints of gamma1 and gamma2 (parameters) to makes sure gradient computation is correct + gamma1->grad()->get(values); + gamma2->grad()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox2) ); + + // compare adjoints of beta1 and beta2 (parameters) to makes sure gradient computation is correct + beta1->grad()->get(values); + beta2->grad()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + values2.begin(), floatApprox2) ); } SECTION("RMS normalization") { @@ -313,7 +349,7 @@ void tests(DeviceType device, Type floatType = Type::float32) { auto a1 = graph->param("test1", {2, 2, 4}, inits::fromVector(init)); auto a2 = graph->param("test2", {2, 2, 4}, 
inits::fromVector(init)); - auto gamma = graph->param("gamma", {1, 4}, inits::ones()); + auto gamma = graph->param("gamma", {4}, inits::ones()); auto rms = rmsNorm(a1, gamma, nullptr, 1e-5f); auto rms2 = gamma * (a2 / sqrt(mean(a2 * a2, /*axis=*/-1) + 1e-5f)); From bd63ccec4ddb919dbbdb9f80f76165d663fcd20d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 3 Jul 2023 04:38:40 +0000 Subject: [PATCH 240/254] Merged PR 28078: Various small improvements Various small improvements, missing operators, missing gradient computations etc. The two most useful ones are probably: * Working backward step (gradient) for scatter operation * Possiblity to use LayerNorm and RMSNorm without scale and bias vectors (especially in new layer framework) --- src/common/config_parser.cpp | 5 + src/common/hash.h | 20 ++++ src/common/shape.h | 41 ++++---- src/graph/expression_graph.cpp | 2 +- src/graph/expression_operators.cpp | 69 +++++++++++--- src/graph/expression_operators.h | 37 +++++--- src/graph/node_operators_binary.h | 55 +++++++++-- src/graph/node_operators_tuple.h | 71 +++++++++++++- src/graph/node_operators_unary.h | 17 ++-- src/layers/embedding.cpp | 3 +- src/layers/embedding.h | 2 +- src/layers/generic.h | 4 +- src/layers_new/embeddings.h | 2 +- src/layers_new/neuralnet.h | 136 ++++++++++++++------------- src/layers_new/rnn.h | 2 +- src/layers_new/transformer.h | 22 ++++- src/models/encoder_decoder.cpp | 3 + src/models/transformer.h | 9 +- src/tensors/cpu/tensor_operators.cpp | 9 +- src/tensors/gpu/add.inc | 3 +- src/tensors/gpu/add_all.inc | 4 +- src/tensors/gpu/element.inc | 5 + src/tensors/gpu/tensor_operators.cu | 88 +++++++++-------- src/tensors/tensor_operators.h | 24 ++++- src/tests/units/operator_tests.cpp | 13 ++- src/training/graph_group.cpp | 2 +- 26 files changed, 459 insertions(+), 189 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 0d8021bf1..3b8d50edf 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -320,6 +320,11 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--transformer-depth-scaling", "Scale down weight initialization in transformer layers by 1 / sqrt(depth)"); + cli.add("--transformer-no-bias", + "Don't use any bias vectors in linear layers"); + cli.add("--transformer-no-affine", + "Don't use any scale or bias vectors in layer norm"); + cli.add("--bert-mask-symbol", "Masking symbol for BERT masked-LM training", "[MASK]"); cli.add("--bert-sep-symbol", "Sentence separator symbol for BERT next sentence prediction training", "[SEP]"); cli.add("--bert-class-symbol", "Class symbol BERT classifier training", "[CLS]"); diff --git a/src/common/hash.h b/src/common/hash.h index c2df2a63e..a05ffcfbc 100644 --- a/src/common/hash.h +++ b/src/common/hash.h @@ -24,5 +24,25 @@ inline HashType hashMem(const T* beg, size_t len, HashType seed = 0) { return seed; } +/** + * Base case for template recursion below (no arguments are hashed to 0) + */ +template +inline HashType hashArgs() { + return 0; +} + +/** + * Hash an arbitrary number of arguments of arbitrary type via template recursion + */ +template +inline HashType hashArgs(T arg, Args... 
args) { + // Hash arguments without first arg + HashType seed = hashArgs(args...); + // Hash first arg and combine which above hash + hash_combine(seed, arg); + return seed; +} + } } diff --git a/src/common/shape.h b/src/common/shape.h index 270b35376..ad2be866f 100644 --- a/src/common/shape.h +++ b/src/common/shape.h @@ -12,28 +12,20 @@ namespace marian { -class ShapeSizeException : public std::exception { -private: - char* message_; - +/** + * This exception gets thrown when the requested shape cannot be allocated due to numeric capacity limitations. +*/ +class ShapeSizeException : public std::runtime_error { public: - ShapeSizeException(size_t available, size_t asked) { - std::string mstr = "Expanded shape size " + std::to_string(asked) - + " exceeds numeric capcacity " + std::to_string(available); - - message_ = new char[mstr.size() + 1]; - std::copy(mstr.begin(), mstr.end(), message_); - message_[mstr.size()] = 0; - } - - ~ShapeSizeException() { delete[] message_; } - - virtual const char* what() const noexcept override { return message_; } + ShapeSizeException(size_t available, size_t asked) + : std::runtime_error(fmt::format("Expanded shape size {} exceeds numeric capcacity {}", asked, available)) + {} }; - -struct Slice // Python-like slice/index descriptor -{ +/** + * Python-like slice/index descriptor + */ +struct Slice { Slice(int b, int e, int s) : begin(b), end(e), stride(s) {} Slice(int b, int e) : Slice(b, e, 1) {} Slice() : Slice(0, END) {} @@ -46,6 +38,7 @@ struct Slice // Python-like slice/index descriptor /*const*/ int begin, end, stride; static const int END = INT_MAX; }; + typedef std::vector Slices; /** @@ -61,6 +54,8 @@ struct Shape { std::vector shape_; public: + typedef std::vector Axes; + Shape() : shape_({1}) {} Shape(std::initializer_list il) : Shape() { @@ -254,6 +249,14 @@ struct Shape { return shape; } + Shape fromAxes(const Axes& axes) const { + Shape subShape; + subShape.resize(size()); + for(Axes::value_type axis : axes) + subShape.set(axis, dim(axis)); + return subShape; + } + size_t hash() const { size_t seed = util::hash()(shape_[0]); for(size_t i = 1; i < shape_.size(); ++i) diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 9e90b5413..ce51b0f2b 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -156,7 +156,7 @@ void ExpressionGraph::forward(std::list& forwardTape, bool finalPass) { if(v->marked_for_debug()) { Logger log = spdlog::get("general"); if(log) { - LOG(info, "Debug: {} op={}", v->debug_message(), v->type()); + LOG(info, "Debug: {} op={} name={}", v->debug_message(), v->type(), v->name()); LOG(info, v->val()->debug()); } else { diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 0ec6f7e67..ad1a4ff19 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -159,6 +159,16 @@ Expr2 topk(Expr a, int k, int axis, bool descending) { return std::make_tuple(swapAxes(topkVal, axis, -1), swapAxes(topkIdx, axis, -1)); // non-op if axes are the same } +Expr topkIndices(Expr a, int k, int axis, bool descending) { + const auto& [values, indices] = topk(a, k, axis, descending); + return choose({values, indices}, 1); +} + +Expr topkValues(Expr a, int k, int axis, bool descending) { + const auto& [values, indices] = topk(a, k, axis, descending); + return choose({values, indices}, 0); +} + Expr2 argmax(Expr a, int axis) { return topk(a, 1, axis, /*descending=*/true); } @@ -353,10 +363,30 @@ Expr flatten_2d(Expr a) { } Expr 
stopGradient(Expr a) { +#if 0 + // This is a different implementation which is more reliable than the original, + // but it introduces a full copy which hogs memory. Keeping it around for now + // to decide later which one to use. + + auto fwd = [](Expr output, const std::vector inputs) { + CopyCast(output->val(), inputs[0]->val()); + }; + + auto bwd = [](Expr output, const std::vector inputs) { + /*Dummy*/ + }; + + return lambda({a}, a->shape(), a->value_type(), fwd, bwd, (size_t)&fwd); +#else // implemented as a dummy reshape that is not trainable auto res = Expression(a, a->shape()); res->setTrainable(false); return res; +#endif +} + +Expr choose(std::vector nodes, size_t index) { + return Expression(nodes, index); } // gather() -- gather arbitrary elements along an axis; batched or non-batched @@ -693,21 +723,28 @@ Expr affineWithReluDropout(Expr x, Expr W, Expr bias, float dropProb) { return Expression(x, W, bias); } else { Expr output = affine(x, W, bias); - int dimModel = output->shape()[-1]; - int dimTime = output->shape()[-2]; - output = dropoutReluInplace(output, dropProb, {dimTime, dimModel}); + output = dropoutReluInplace(output, dropProb, Shape::Axes({-2, -1})); return output; } } +Expr dropoutReluInplace(Expr x, Expr mask) { + return Expression(x, mask); +} + Expr dropoutReluInplace(Expr x, float dropProb, Shape shape) { - if(dropProb == 0) { - return relu(x); - } else { - auto graph = x->graph(); - auto mask = graph->dropoutMask(dropProb, shape); - return Expression(x, mask); - } + Expr mask = dropProb ? x->graph()->dropoutMask(dropProb, shape) : nullptr; + return dropoutReluInplace(x, mask); +} + +Expr dropoutReluInplace(Expr x, float dropProb, const Shape::Axes& axes) { + Expr mask = dropProb ? x->graph()->dropoutMask(dropProb, x->shape().fromAxes(axes)) : nullptr; + return dropoutReluInplace(x, mask); +} + +Expr dropoutReluInplace(Expr x, float dropProb) { + Expr mask = dropProb ? x->graph()->dropoutMask(dropProb, x->shape()) : nullptr; + return dropoutReluInplace(x, mask); } // @TODO: Not a great place to check this @@ -860,24 +897,28 @@ Expr square(Expr a) { } Expr layerNorm(Expr x, - Expr gamma, + Expr gamma/*= nullptr*/, Expr beta /*= nullptr*/, float eps /*= 1e-9*/) { // layerNorm accumulates in float, so small eps is fine - std::vector nodes = {x, gamma}; + std::vector nodes = {x}; + if(gamma) + nodes.push_back(gamma); if(beta) nodes.push_back(beta); return Expression(nodes, eps); } Expr rmsNorm(Expr x, - Expr gamma, + Expr gamma /*= nullptr*/, Expr beta /*= nullptr*/, float eps /*= 1e-9*/) { // layerNorm accumulates in float, so small eps is fine - std::vector nodes = {x, gamma}; + std::vector nodes = {x}; + if(gamma) + nodes.push_back(gamma); if(beta) nodes.push_back(beta); return Expression(nodes, eps); diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index faef5c29e..e96d8f7c9 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -386,6 +386,8 @@ Expr get(Expr2 tuple) { return std::get(tuple); } * @returns An ordered 2-tuple of Expressions */ Expr2 topk(Expr a, int k, int axis, bool descending = true); +Expr topkIndices(Expr a, int k, int axis, bool descending = true); +Expr topkValues(Expr a, int k, int axis, bool descending = true); /** * Returns largest elements of an expression along an axis. @@ -683,6 +685,13 @@ Expr flatten_2d(Expr a); */ Expr stopGradient(Expr a); +/** + * Return index-th node from nodes. 
This is a selector which add `nodes` into the computation graph + * and makes sure they do not end up unattached if not used due to some condition that computes `index` + * for only one of them. This is a no-op similar to `reshape`. +*/ +Expr choose(std::vector nodes, size_t index); + /** * Gathers elements along an axis. * @param a The input expression @@ -924,7 +933,7 @@ Expr weighted_average(Expr in, Expr weights, int ax = 0); * @f] * @see LayerNormalizationOp */ -Expr layerNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); +Expr layerNorm(Expr x, Expr gamma = nullptr, Expr beta = nullptr, float eps = 1e-9); /** * Applies RMS normalization over the last dimension. @@ -936,7 +945,7 @@ Expr layerNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); * @f] * @see RMSNormalizationOp */ -Expr rmsNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); +Expr rmsNorm(Expr x, Expr gamma = nullptr, Expr beta = nullptr, float eps = 1e-9); /** * Highway transformation. @@ -957,7 +966,7 @@ Expr highway(const std::string prefix, Expr x); * Performs dropout using a given mask. */ static inline Expr dropout(Expr x, Expr mask) { - if (mask) + if(mask) return x * mask; else return x; @@ -967,24 +976,30 @@ static inline Expr dropout(Expr x, Expr mask) { * Performs dropout with a given probably and explicit shape. */ static inline Expr dropout(Expr x, float dropProb, Shape shape) { - if(dropProb == 0) - return x; - auto graph = x->graph(); - auto mask = graph->dropoutMask(dropProb, shape); + auto mask = dropProb ? x->graph()->dropoutMask(dropProb, shape) : nullptr; return dropout(x, mask); } +/** + * Performs dropout with a given probably over explicit axes. + */ +static inline Expr dropout(Expr x, float dropProb, const Shape::Axes& axes) { + auto mask = dropProb ? x->graph()->dropoutMask(dropProb, x->shape().fromAxes(axes)) : nullptr; + return dropout(x, mask); +} /** * Performs dropout with a given probability. */ static inline Expr dropout(Expr x, float dropProb) { - if(dropProb == 0) - return x; - return dropout(x, dropProb, x->shape()); + auto mask = dropProb ? 
x->graph()->dropoutMask(dropProb, x->shape()) : nullptr; + return dropout(x, mask); } -Expr dropoutReluInplace(Expr x, float dropProb, Shape shape); +Expr dropoutReluInplace(Expr x, Expr mask=nullptr); +Expr dropoutReluInplace(Expr x, float dropProb, Shape maskShape); +Expr dropoutReluInplace(Expr x, float dropProb, const Shape::Axes& axes); +Expr dropoutReluInplace(Expr x, float dropProb); /** * Shifts the elements of an expression by a per-axis offset @p shift diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index d35ca6fff..29259f983 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1031,13 +1031,11 @@ struct GatherNodeOp : public NaryNodeOp { NodeOps forwardOps() override { return {NodeOp( - // @TODO: rename to gather - Select(val_, child(0)->val(), child(1)->val(), axis_))}; + Select(val_, child(0)->val(), child(1)->val(), axis_))}; } NodeOps backwardOps() override { return {NodeOp( - // @TODO: rename to scatter Insert(child(0)->grad(), adj_, /*indices=*/child(1)->val(), axis_))}; } @@ -1095,17 +1093,52 @@ struct ScatterNodeOp : public NaryNodeOp { NodeOps forwardOps() override { return {NodeOp( CopyCast(val_, child(0)->val()); // @TODO: use normal copy - Insert(val_, /*source=*/child(2)->val(), /*indices=*/child(1)->val(), axis_) + Insert(val_, /*source=*/child(2)->val(), /*indices*/child(1)->val(), axis_); )}; } NodeOps backwardOps() override { - ABORT("backward for ScatterNodeOp not yet implemented"); + auto backwardForVal = [this]() { + auto allocator = graph()->allocator(); + + // create temporary tensor of child(0)->grad().shape() == adj_.shape() + // copy adj_ to temporary + auto grad = child(0)->grad(); + auto tempGradMem = allocator->alloc(grad->memory()->size()); + Tensor tempGrad = TensorBase::New(tempGradMem, grad->shape(), grad->type(), grad->getBackend()); + CopyCast(tempGrad, adj_); + + // create temporary tensor of zeros of values.shape() and values type + auto source = child(2)->val(); + auto tempZeroMem = allocator->alloc(source->memory()->size()); + Tensor tempZero = TensorBase::New(tempZeroMem, source->shape(), source->type(), source->getBackend()); + tempZero->set(0); + + // insert tensor of zeros into temporary + Insert(tempGrad, /*source=*/tempZero, /*indices*/child(1)->val(), axis_); + + // add temporary do child(0)->grad() + Add(functional::_1, grad, tempGrad); + + // clear temporary memory + allocator->free(tempGradMem); + allocator->free(tempZeroMem); + }; + + return { + // val - add gradients every where else to gradient of "a" + NodeOp(backwardForVal()), + + NodeOp(/*no gradient*/[](){}), // indices + + // add gradients on indices to gradient of "source" + NodeOp(Select(/*source*/child(2)->grad(), adj_, /*indices=*/child(1)->val(), axis_)) + }; } Shape newShape(Expr a, int axis, Expr indices, Expr source) { ABORT_IF(axis != -1, "only last dimensions"); - // ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); or broadcast + ABORT_IF(indices->shape() != source->shape(), "Shapes must match"); Shape shape = a->shape(); // @TODO: do proper checking @@ -1152,7 +1185,9 @@ struct ColsNodeOp : public NaryNodeOp { } NodeOps backwardOps() override { - return {NodeOp(PasteCols(child(0)->grad(), adj_, child(1)->val()))}; + return {NodeOp( + PasteCols(child(0)->grad(), adj_, child(1)->val()); + )}; } Shape newShape(Expr a, Expr indices) { @@ -1555,7 +1590,7 @@ struct LayerNormalizationOp : public NaryNodeOp { return {NodeOp( LayerNormalization(val_, child(0)->val(), - 
child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } @@ -1566,12 +1601,12 @@ struct LayerNormalizationOp : public NaryNodeOp { LayerNormalizationGrad( graph()->allocator(), child(0)->grad(), - child(1)->grad(), + (children_.size() >= 2) ? child(1)->grad() : nullptr, (children_.size() == 3) ? child(2)->grad() : nullptr, adj_, val_, child(0)->val(), - child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h index 8acb1bc83..4444e2ef8 100644 --- a/src/graph/node_operators_tuple.h +++ b/src/graph/node_operators_tuple.h @@ -1,5 +1,6 @@ #pragma once +#include "graph/node_operators.h" #include "graph/node_operators_unary.h" namespace marian { @@ -133,7 +134,7 @@ struct TopKNodeOp : public UnaryNodeOp, } void backward() override { - Insert(/*out*/child(0)->grad(), adj_, val_, axis_); + Insert(/*out*/child(0)->grad(), adj_, tupleVal_, axis_); } const std::string type() override { return "topk"; } @@ -164,4 +165,72 @@ struct TopKNodeOp : public UnaryNodeOp, } }; +// This node attaches multiple children to a parent node and allows +// to select one of them via a given index. This is mostly used to avoid +// unattached nodes that might nevertheless get created based on some +// runtime criterion that is not fully clear during construction. +class ChooseNodeOp : public NaryNodeOp { +protected: + friend class SerializationHelpers; + Expr chosen_; + size_t index_; + +public: + ChooseNodeOp(std::vector nodes, size_t index) + : NaryNodeOp(nodes, nodes[index]->shape(), nodes[index]->value_type()), + chosen_(nodes[index]), index_(index) { + Node::destroy_ = false; + } + + ~ChooseNodeOp() {} + + void allocate() override {} + void free() override {} + + void forward() override {} + void backward() override {} + + void init_dependent() override { chosen_->init_dependent(); } + + void set_zero_adjoint() override { chosen_->set_zero_adjoint(); } + + Tensor& val() override { + auto childVal = chosen_->val(); + auto temp = TensorBase::New(childVal->memory(), shape(), childVal->type(), childVal->getBackend()); + val_.swap(temp); + return val_; + }; + + Tensor& grad() override { + auto childGrad = chosen_->grad(); + auto temp = TensorBase::New(childGrad->memory(), shape(), childGrad->type(), childGrad->getBackend()); + adj_.swap(temp); + return adj_; + }; + + const std::string type() override { return "choose"; } + + const std::string color() override { return "grey"; } + + virtual size_t hash() override { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, index_); + hash_ = seed; + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(index_ != cnode->index_) + return false; + return true; + } +}; + } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 4e78e7166..6189d3cc9 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -888,8 +888,6 @@ class ReshapeNodeOp : public UnaryNodeOp { } }; - - // @TODO: add version with access to backward step // This allows to attach a lambda function to any node during the execution. It is a non-operation otherwise // i.e. 
doesn't consume any memory or take any time to execute (it's a reshape onto itself) other than the @@ -934,25 +932,32 @@ class DropoutReluInplaceNodeOp : public ReshapeNodeOp { Expr mask_; public: - DropoutReluInplaceNodeOp(Expr node, Expr mask) + DropoutReluInplaceNodeOp(Expr node, Expr mask = nullptr) : ReshapeNodeOp(node, node->shape()), mask_(mask) {} void forward() override { using namespace marian::functional; - Element(_1 = ReLU(_1 * _2), val(), mask_->val()); + if(mask_) + Element(_1 = ReLU(_1 * _2), val(), mask_->val()); + else + Element(_1 = ReLU(_1), val()); } void backward() override { using namespace marian::functional; - Element(_1 = _1 * ReLUback(_2) * _3, grad(), val(), mask_->val()); + if(mask_) + Element(_1 = _1 * ReLUback(_2) * _3, grad(), val(), mask_->val()); + else + Element(_1 = _1 * ReLUback(_2), grad(), val()); } const std::string type() override { return "dropoutReluInplace"; } virtual size_t hash() override { size_t seed = ReshapeNodeOp::hash(); - util::hash_combine(seed, mask_->hash()); + if(mask_) + util::hash_combine(seed, mask_->hash()); return seed; } diff --git a/src/layers/embedding.cpp b/src/layers/embedding.cpp index 93c6d9b33..377a4010a 100644 --- a/src/layers/embedding.cpp +++ b/src/layers/embedding.cpp @@ -169,8 +169,7 @@ Expr Embedding::applyIndices(const std::vector& embIdx, const Shape& // @BUGBUG: We should not broadcast along dimBatch=[-2]. Then we can also dropout before reshape() // (test that separately) if(!inference_) - selectedEmbs = dropout( - selectedEmbs, options_->get("dropout", 0.0f), {selectedEmbs->shape()[-3], 1, 1}); + selectedEmbs = dropout(selectedEmbs, options_->get("dropout", 0.0f), Shape::Axes({-3})); return selectedEmbs; } diff --git a/src/layers/embedding.h b/src/layers/embedding.h index af22b980a..6895c4ab8 100644 --- a/src/layers/embedding.h +++ b/src/layers/embedding.h @@ -170,7 +170,7 @@ class ULREmbedding : public LayerBase, public IEmbeddingLayer { if(!inference_) batchEmbeddings = dropout(batchEmbeddings, options_->get("dropout-embeddings", 0.0f), - {batchEmbeddings->shape()[-3], 1, 1}); + Shape::Axes({-3})); return std::make_tuple(batchEmbeddings, batchMask); } diff --git a/src/layers/generic.h b/src/layers/generic.h index df11a2337..bd80a09ea 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -239,9 +239,7 @@ static inline Expr denseInline(Expr x, x = affine(x, W, b); x = activationByName(actName)(x); - int dimModel = x->shape()[-1]; - int dimTime = x->shape()[-2]; - x = dropout(x, dropProb, {dimTime, dimModel}); + x = dropout(x, dropProb, Shape::Axes({-2, -1})); } return x; diff --git a/src/layers_new/embeddings.h b/src/layers_new/embeddings.h index e080906fe..bbe971d1b 100644 --- a/src/layers_new/embeddings.h +++ b/src/layers_new/embeddings.h @@ -113,7 +113,7 @@ class Embedding : public LayerWithOptions, public IEmbeddingLayer { auto selectedEmbs = rows(embeddings, embIdx); // [(B*W) x E] selectedEmbs = reshape(selectedEmbs, shape); // [W, B, E] // @BUGBUG: We should not broadcast along dimBatch=[-2]. Then we can also dropout before reshape() (test that separately) - selectedEmbs = dropout(selectedEmbs, opt("dropout", 0.0f), { selectedEmbs->shape()[-3], 1, 1 }); + selectedEmbs = dropout(selectedEmbs, opt("dropout", 0.0f), Shape::Axes({-3})); // @TODO: dropout here seems wrong! 
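    // Illustrative aside, not part of this patch: dropout(x, p, Shape::Axes({-3})) builds its mask
    // through the new Shape::fromAxes() helper, so the mask keeps the size of the listed axes and
    // broadcasts with size 1 over every other axis. For selectedEmbs of shape [W, B, E] that is a
    // [W, 1, 1] mask, i.e. whole source positions are zeroed for the entire batch at once; this is
    // the same behaviour as the explicit-shape form removed just above:
    //   dropout(selectedEmbs, dropProb, { selectedEmbs->shape()[-3], 1, 1 });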
return selectedEmbs; } diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 278758a96..33c089624 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -146,28 +146,26 @@ struct Linear : public Layer, public IUnaryLayer { }; struct Dropout final : public Layer, public IUnaryLayer { - float dropoutProbabilty; - UPtr dropoutMaskShape; + float dropoutProbability; + Shape::Axes dropoutAxes{{-2, -1}}; Dropout(Ptr graph, - float dropoutProbabilty, - const Shape& dropoutMaskShape) - : Layer(graph), dropoutProbabilty(dropoutProbabilty), dropoutMaskShape(new Shape(dropoutMaskShape)) + float dropoutProbability, + const Shape::Axes& dropoutAxes) + : Layer(graph), dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} Dropout(Ptr graph, - float dropoutProbabilty) - : Layer(graph), dropoutProbabilty(dropoutProbabilty), dropoutMaskShape(nullptr) + float dropoutProbability) + : Layer(graph), dropoutProbability(dropoutProbability) {} Expr apply(Expr input) const override { if(getMode() == Mode::eval) return input; - if(dropoutMaskShape && dropoutProbabilty > 0.f) { - return marian::dropout(input, dropoutProbabilty, *dropoutMaskShape); - } else if(dropoutProbabilty > 0.f) { - return marian::dropout(input, dropoutProbabilty, {input->shape()[-2], input->shape()[-1]}); + if(dropoutProbability > 0.f) { + return marian::dropout(input, dropoutProbability, dropoutAxes); } else { return input; } @@ -185,30 +183,29 @@ struct LinearReluDropout final : public Linear { using Linear::transposed; using Linear::init; - float dropoutProbabilty; - UPtr dropoutMaskShape; + float dropoutProbability; + Shape::Axes dropoutAxes{{-2, -1}}; // Typical constructor that can take an initializer function LinearReluDropout(Ptr graph, int dimOut, - float dropoutProbabilty, + float dropoutProbability, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) : Linear(graph, dimOut, useBias, transposed, init), - dropoutProbabilty(dropoutProbabilty), - dropoutMaskShape(nullptr) {} + dropoutProbability(dropoutProbability) {} + // Typical constructor that can take an initializer function LinearReluDropout(Ptr graph, int dimOut, - float dropoutProbabilty, - const Shape& dropoutMaskShape, + float dropoutProbability, + const Shape::Axes& dropoutAxes, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) : Linear(graph, dimOut, useBias, transposed, init), - dropoutProbabilty(dropoutProbabilty), - dropoutMaskShape(new Shape(dropoutMaskShape)) {} + dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} Expr apply(Expr x) const override { int dimIn = x->shape()[-1]; @@ -224,83 +221,94 @@ struct LinearReluDropout final : public Linear { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } - // @TODO: handle relu inplace for inference etc. 
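    // Illustrative aside, not part of this patch: with a mask, DropoutReluInplaceNodeOp computes
    // ReLU(x * mask) in place; since the dropout mask is non-negative (zero for dropped units, a
    // positive keep-scale otherwise), this equals mask * ReLU(x), i.e. ReLU followed by dropout,
    // just without materializing intermediate tensors. Without a mask (the eval-mode branch below)
    // it degenerates to a plain in-place ReLU. A rough un-fused equivalent would be:
    //   auto h = useBias ? affine(x, weight, bias) : dot(x, weight);
    //   h = dropout(relu(h), dropoutProbability, dropoutAxes);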
Expr output; if(useBias) output = marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed); else output = marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); - if(getMode() == Mode::eval) - return relu(output); - - if(dropoutMaskShape && dropoutProbabilty > 0.f) { - return marian::dropoutReluInplace(output, dropoutProbabilty, *dropoutMaskShape); - } else if(dropoutProbabilty > 0.f) { - return marian::dropoutReluInplace(output, dropoutProbabilty, {output->shape()[-2], output->shape()[-1]}); + if(getMode() == Mode::eval) { + return marian::dropoutReluInplace(output); // no dropout } else { - return relu(output); + return marian::dropoutReluInplace(output, dropoutProbability, dropoutAxes); } } virtual void clear() override {} }; - struct Norm : public Layer, public IUnaryLayer { - Norm(Ptr graph) : Layer(graph) {} - virtual ~Norm() = default; + Expr scale{nullptr}; + Expr bias{nullptr}; + + bool useScale{true}; + bool useBias{true}; + bool elementwise{true}; + float eps{1e-5f}; - Expr apply(Expr x) const override = 0; -}; + Norm(Ptr graph, + bool useScale = true, + bool useBias = true, + bool elementwise = true, + float eps = 1e-5f) + : Layer(graph), + useScale(useScale), + useBias(useBias), + elementwise(elementwise), + eps(eps) {} + + virtual Expr getScale(int dimModel) const { + Expr scaleVector = nullptr; + if(useScale) { + registerParameterLazy(scale, Shape({ elementwise ? dimModel : 1 }), inits::ones()); + // if elementwise==false we multiply with a vector of 1s - that's a trick to make gradient computation faster + scaleVector = elementwise ? scale : scale * graph()->ones({dimModel}); // @TODO: make this obsolete + } + return scaleVector; + } -struct LayerNorm final : public Norm { - Expr weight; - Expr bias; + virtual Expr getBias(int dimModel) const { + Expr biasVector = nullptr; + if(useBias) { + registerParameterLazy(bias, Shape({ elementwise ? dimModel : 1 }), inits::zeros()); + // if elementwise==false we multiply with a vector of 1s - that's a trick to make gradient computation faster + biasVector = elementwise ? 
bias : bias * graph()->ones({dimModel}); // @TODO: make this obsolete + } + return biasVector; + } - float eps{1e-5f}; - bool elementwiseAffine{true}; + Expr apply(Expr x) const override = 0; +}; +struct LayerNorm : public Norm { LayerNorm(Ptr graph, - float eps = 1e-5f, - bool elementwiseAffine = true) - : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine) + bool useScale = true, + bool useBias = true, + bool elementwise = true, + float eps = 1e-5f) + : Norm(graph, useScale, useBias, elementwise, eps) {} Expr apply(Expr x) const override { int dimModel = x->shape()[-1]; - if(elementwiseAffine) { - registerParameterLazy(weight, Shape({ dimModel }), inits::ones()); - registerParameterLazy(bias, Shape({ dimModel }), inits::zeros()); - return marian::layerNorm(x, weight, bias, eps); - } else { - return marian::layerNorm(x, nullptr, nullptr, eps); - } + return marian::layerNorm(x, getScale(dimModel), getBias(dimModel), eps); } virtual void clear() override {} }; -struct RMSNorm final : public Norm { - Expr weight; - - float eps{1e-5f}; - bool elementwiseAffine{true}; - +struct RMSNorm : public Norm { RMSNorm(Ptr graph, - float eps = 1e-5f, - bool elementwiseAffine = true) - : Norm(graph), eps(eps), elementwiseAffine(elementwiseAffine) + bool useScale = true, + bool useBias = true, + bool elementwise = true, + float eps = 1e-5f) + : Norm(graph, useScale, useBias, elementwise, eps) {} Expr apply(Expr x) const override { int dimModel = x->shape()[-1]; - if(elementwiseAffine) { - registerParameterLazy(weight, Shape({ dimModel }), inits::ones()); - return marian::rmsNorm(x, weight, nullptr, eps); - } else { - return marian::rmsNorm(x, nullptr, nullptr, eps); - } + return marian::rmsNorm(x, getScale(dimModel), getBias(dimModel), eps); } }; diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index da3ac4f94..281d2dce9 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -31,7 +31,7 @@ class SSRU final : public Layer, public ICell { registerLayer(iProj); fProj = New(graph, dimState); registerLayer(fProj); - dropout = New(graph, dropProb, Shape({dimState})); + dropout = New(graph, dropProb, Shape::Axes({-1})); registerLayer(dropout); } diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index e808694de..ade61a78e 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -239,7 +239,17 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { if(opt("transformer-depth-scaling", false)) for(auto linear : transformerEncoderLayer->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); - + + if(opt("transformer-no-bias", false)) + for(auto linear : transformerEncoderLayer->allLayers()) + linear->useBias = false; + + if(opt("transformer-no-affine", false)) { + for(auto norm : transformerEncoderLayer->allLayers()) { + norm->useScale = false; + norm->useBias = false; + } + } layers->append(transformerEncoderLayer); } @@ -491,7 +501,17 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); for(auto linear : currentLayer->filterBlock->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + } + if(opt("transformer-no-bias", false)) + for(auto linear : currentLayer->allLayers()) + linear->useBias = false; + + if(opt("transformer-no-affine", false)) { + for(auto norm : currentLayer->allLayers()) { + norm->useScale 
= false; + norm->useBias = false; + } } } diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 6a298ed0d..f70353a64 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -68,6 +68,9 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("lemma-dependency"); modelFeatures_.insert("factors-combine"); modelFeatures_.insert("factors-dim-emb"); + + modelFeatures_.insert("transformer-no-bias"); + modelFeatures_.insert("transformer-no-affine"); } std::vector>& EncoderDecoder::getEncoders() { diff --git a/src/models/transformer.h b/src/models/transformer.h index a3f6d9b53..0fa52ff82 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -170,11 +170,8 @@ class Transformer : public EncoderOrDecoderBase { auto output = input; for(auto op : ops) { // dropout - if (op == 'd') { - int dimModel = output->shape()[-1]; - int dimTime = output->shape()[-2]; - output = dropout(output, dropProb, {dimTime, dimModel}); - } + if (op == 'd') + output = dropout(output, dropProb, Shape::Axes({-2, -1})); // layer normalization else if (op == 'n') output = layerNorm(output, prefix, "_pre"); @@ -191,7 +188,7 @@ class Transformer : public EncoderOrDecoderBase { for(auto op : ops) { // dropout if(op == 'd') - output = dropout(output, dropProb); + output = dropout(output, dropProb, Shape::Axes({-2, -1})); // skip connection else if(op == 'a') output = output + prevInput; diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 5be3eee26..6a075e9c5 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -710,6 +710,7 @@ void SelectAxis2(Tensor out, } #endif +template void Select(Tensor out, const Tensor in, const Tensor indices, @@ -736,10 +737,16 @@ void Select(Tensor out, int idxIndex = idxShape.bindex(dims); // return global index for indices based on dimension-specific indices from out, take broadcasting into account; dims[axisCPU] = (int)indices->data()[idxIndex]; // substitute index of out-tensor with corresponding axis-local position from in-tensor; int inIndex = inShape.index(dims); // compute global index from dimension-specific indices, no broadcasting as out and in match in all dimensions apart from axis - out->data()[index] = in->data()[inIndex]; // assign corresponding values. + if(add) + out->data()[index] += in->data()[inIndex]; // add for gradients. + else + out->data()[index] = in->data()[inIndex]; // assign corresponding values. 
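        // Illustrative aside, not part of this patch: with add=false this loop is a plain gather
        // along `axis`, out[j] = in[indices[j]] (modulo broadcasting of the index tensor); with
        // add=true the gathered values are accumulated instead. For a last-axis example,
        //   in = [10, 20, 30], indices = [2, 0, 0]
        // gives out = [30, 10, 10] for add=false, and out += [30, 10, 10] for add=true. The
        // accumulating variant is presumably what the new ScatterNodeOp backward relies on when it
        // selects from the adjoint into the gradient of the scattered source values.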
} } +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); + template void Insert(Tensor out, const Tensor in, diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 1b233bb1b..ed1e72553 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -39,4 +39,5 @@ template void marian::gpu::Aggregate,marian::functional::UnaryFunctor > >,class IntrusivePtr,class IntrusivePtr >(marian::functional::BinaryFunctor,marian::functional::UnaryFunctor > >,float,class IntrusivePtr,class IntrusivePtr,class IntrusivePtr); template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); \ No newline at end of file +template void marian::gpu::Add >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index b983b7b7e..41da1351b 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -1,4 +1,4 @@ -// see element.inc for instructions on how to maintain this + // see element.inc for instructions on how to maintain this using namespace functional; 
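// Illustrative aside, not part of this patch: the *.inc files pin down which explicit template
// instantiations of the element/aggregate kernels get compiled for the GPU backend. The usual
// maintenance loop, sketched here on the assumption that the element.inc instructions still apply:
// use a new functor expression in regular code, let the link step fail with an undefined
// marian::gpu::Element/Add or marian::AggregateAll symbol, then paste the reported signature into
// the matching .inc file with the tensor type rewritten as marian::Tensor. The lines added in this
// hunk appear to cover the masked-ReLU/dropout and scatter-gradient functors introduced earlier in
// this commit.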
template void AggregateAll>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -41,6 +41,7 @@ template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #if COMPILE_FP16 @@ -84,5 +85,6 @@ template void marian::AggregateAll<__half, float, marian::functional::UnaryFunct template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, 
marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::AggregateAll<__half, float, marian::functional::Assignee<1>, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); #endif diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index 730817849..27cc641da 100755 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -73,6 +73,8 @@ template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor, marian::functional::Capture> >, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::UnaryFunctor, marian::functional::Assignee<2> > > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::Assignee<3> > >, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, 
marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr, IntrusivePtr); // How to add new specializations: @@ -82,3 +84,6 @@ template void marian::gpu::Element' with 'marian::Tensor' + +template void marian::gpu::Element, marian::functional::UnaryFunctor > >>(marian::functional::Assign, marian::functional::UnaryFunctor > >, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr, IntrusivePtr); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 508e1e3e7..5f8c4c122 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1304,7 +1304,7 @@ void PasteCols(Tensor out, } } -template +template __global__ void gSelect(T* out, functional::Shape outShape, const T* in, @@ -1322,7 +1322,10 @@ __global__ void gSelect(T* out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int inIndex = inShape.index(dims); - out[index] = in[inIndex]; + if(add) + out[index] += in[inIndex]; + else + out[index] = in[inIndex]; } } } @@ -1353,6 +1356,7 @@ __global__ void gInsert(T* out, } } +template void Select(Tensor out, const Tensor in, const Tensor indices, @@ -1369,36 +1373,39 @@ void Select(Tensor out, int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { - gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gSelect<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { - gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gSelect<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else if(out->type() == Type::uint32) { - gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gSelect<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); } else { ABORT("Select not implemented for type {}", out->type()); } } +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Select(Tensor out, const Tensor in, const Tensor indices, int axis); + template void Insert(Tensor out, const Tensor in, @@ -2152,7 +2159,7 
@@ __global__ void gLNormalization(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType xv = (AccType)xRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; AccType lv = (xv - mean) / sigma; @@ -2182,7 +2189,7 @@ void LayerNormalization(Tensor out, if(out->type() == Type::float32) { gLNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2191,7 +2198,7 @@ void LayerNormalization(Tensor out, } else if (out->type() == Type::float16) { gLNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2241,7 +2248,7 @@ __global__ void gLayerNormalizationGrad(T* gradX, AccType xv = xRow[id]; AccType yv = yRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType lv = (yv - betav) / gammav; // go back to LN(x) from scaled and shifted version for accumulation @@ -2297,7 +2304,7 @@ __global__ void gLayerNormalizationGrad(T* gradX, if(id < cols) { AccType xv = xRow[id]; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType lv = (xv - mean) / sigma; @@ -2318,10 +2325,12 @@ __global__ void gLayerNormalizationGrad(T* gradX, T* gradXRow = gradX + j * cols; gradXRow[id] += (T)(gradXv); - T* gradGammaRow = gradGamma + j * cols; - // assignment is correct here as this gets summed up - // in the next kernel via matrix product - gradGammaRow[id] = (T)(adjv * lv); + if(gamma) { + T* gradGammaRow = gradGamma + j * cols; + // assignment is correct here as this gets summed up + // in the next kernel via matrix product + gradGammaRow[id] = (T)(adjv * lv); + } } } } @@ -2358,12 +2367,12 @@ void LayerNormalizationGrad(Ptr allocator, int shared = sizeof(float) * threads * 4; gLayerNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2373,12 +2382,12 @@ void LayerNormalizationGrad(Ptr allocator, int shared = sizeof(float) * threads * 4; gLayerNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2392,7 +2401,8 @@ void LayerNormalizationGrad(Ptr allocator, // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. 
// See also AffineNodeOp where we do the same for biases - gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add + if(gradGamma) + gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add if(gradBeta) // dC/dbeta = adj - inverse broadcasting (reduction) gpu::Prod(gradBeta, tempOnes, adj, false, false, 1, 1, Type::float32); // beta set to one to add diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 31bd1e14f..2747a6d66 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -301,8 +301,6 @@ DISPATCH3(PasteRows, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor) -DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int) - #ifdef CUDA_FOUND namespace gpu { template @@ -325,6 +323,28 @@ static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int cpu::Insert(out, in, indices, axis); } +#ifdef CUDA_FOUND +namespace gpu { + template + void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +} +#endif + +namespace cpu { + template + void Select(Tensor out, const Tensor in, const Tensor indices, int axis); +} + +template +static inline void Select(Tensor out, const Tensor in, const Tensor indices, int axis) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Select(out, in, indices, axis); + else +#endif + cpu::Select(out, in, indices, axis); +} + DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); DISPATCH2(LSTMCellForward, marian::Tensor, std::vector) diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 34a0dd6f5..5806e94de 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -631,8 +631,15 @@ void tests(DeviceType device, Type floatType = Type::float32) { auto aff1 = affine(A, B, bias); auto aff2 = dot(A, B) + bias; - auto affRelu1 = affineWithReluDropout(A, B, bias); - auto affRelu2 = relu(dot(A, B) + bias); + auto A2 = graph->param("A2", {4, 3}, inits::fromVector(vA)); + auto B2 = graph->param("B2", {3, 2}, inits::fromVector(vB)); + + // @TODO: using this operator here is currently dangerous since the inplace + // operator inside might modify values in-place if the same operation is executed + // twice on the same inputs. (Hence the new parameters A2 and B2 here) + // This needs to be fixed in the future. 
+ auto affRelu1 = affineWithReluDropout(A2, B2, bias); + auto affRelu2 = relu(dot(A2, B2) + bias); graph->forward(); @@ -643,7 +650,7 @@ void tests(DeviceType device, Type floatType = Type::float32) { values2.clear(); CHECK(aff2->shape() == aff1->shape()); aff2->val()->get(values2); - CHECK(values2 == values); + CHECK(values == values2); affRelu1->val()->get(values); affRelu2->val()->get(values2); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 367e47e16..43adddcac 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -638,7 +638,7 @@ Ptr GraphGroup::collectStats(Ptr graph, auto loss = model->build(graph, batch); fits = graph->fits(); } catch(const ShapeSizeException& e) { - LOG(debug, "Exception for maxBatch size {}: {}", maxBatch, e.what()); + LOG(debug, "Exception for maxBatch size {}: {}", current, e.what()); fits = false; } From a5b50f2ddc54759e65bd8616781eba43cc886973 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sun, 16 Jul 2023 23:23:38 +0000 Subject: [PATCH 241/254] Merged PR 30282: Fix parameter name for norms in new layer framework. Undoes the accidental renaming of the scale parameter in Norms layer back to "weight". --- CHANGELOG.md | 1 + VERSION | 2 +- src/layers_new/neuralnet.h | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fb1dfd2d..a40214ad5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed wrong paramter name for norm in new layer framework - Fixed unit test for LayerNorm - Only collect batch statistics during mini-batch-fit up to actual max-length. - Implemented fully correct version of GELU instead of using bad approximatin via Swish. diff --git a/VERSION b/VERSION index 893904681..d9d998341 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.7 +v1.12.8 diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 33c089624..b81728c77 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -238,7 +238,7 @@ struct LinearReluDropout final : public Linear { }; struct Norm : public Layer, public IUnaryLayer { - Expr scale{nullptr}; + Expr weight{nullptr}; // = scale Expr bias{nullptr}; bool useScale{true}; @@ -260,9 +260,9 @@ struct Norm : public Layer, public IUnaryLayer { virtual Expr getScale(int dimModel) const { Expr scaleVector = nullptr; if(useScale) { - registerParameterLazy(scale, Shape({ elementwise ? dimModel : 1 }), inits::ones()); + registerParameterLazy(weight, Shape({ elementwise ? dimModel : 1 }), inits::ones()); // if elementwise==false we multiply with a vector of 1s - that's a trick to make gradient computation faster - scaleVector = elementwise ? scale : scale * graph()->ones({dimModel}); // @TODO: make this obsolete + scaleVector = elementwise ? weight : weight * graph()->ones({dimModel}); // @TODO: make this obsolete } return scaleVector; } From c8f1e03c0a7c80bf1578f90756c885db224c7982 Mon Sep 17 00:00:00 2001 From: Varun Mathur Date: Mon, 17 Jul 2023 12:11:56 +0000 Subject: [PATCH 242/254] Merged PR 30198: [quicksand] cache YAML configs Reusing these YAML configs helps speed up coreleaf loading. The only consumers of this quicksand API are the leaf, and I think this small memory tradeoff of keeping these in cache is worth the speedup. 
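The caching described above boils down to a process-wide map from model path to parsed config, guarded by a mutex so that concurrent decoder construction stays safe. Below is a minimal, self-contained sketch of that pattern. The names `ConfigCache`, `ParsedConfig` and `getOrParse` are invented for this illustration and are not the quicksand API; the real code appears in the diff below.

```
#include <mutex>
#include <string>
#include <unordered_map>

// Illustrative stand-in for a parsed YAML model configuration.
struct ParsedConfig { std::string yaml; };

class ConfigCache {
  std::unordered_map<std::string, ParsedConfig> cache_; // keyed by model path
  std::mutex mutex_;                                     // guards cache_ across decoder instances

public:
  // Return the cached config for modelPath; parse and store it on the first request only.
  template <class ParseFn>
  ParsedConfig getOrParse(const std::string& modelPath, ParseFn parse) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = cache_.find(modelPath);
    if(it == cache_.end())                                   // cache miss: parse once and remember
      it = cache_.emplace(modelPath, parse(modelPath)).first;
    return it->second;                                       // cache hit: reuse without re-parsing
  }
};
```

The actual change below splits this into a locked lookup (`getConfigFromCache`) and a locked write (`writeConfigToCache`) with the parsing done in between; holding the lock across the parse, as in this sketch, is simply the shorter variant.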
Related work items: #146810 --- src/microsoft/quicksand.cpp | 40 ++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 316c66d11..2302819eb 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -1,5 +1,7 @@ #include "quicksand.h" #include "marian.h" +#include +#include #if MKL_FOUND #include "mkl.h" @@ -60,6 +62,8 @@ class BeamSearchDecoder : public IBeamSearchDecoder { std::vector> vocabs_; + static inline std::unordered_map configCache_; + static inline std::mutex configCacheMutex_; public: BeamSearchDecoder(Ptr options, const std::vector& ptrs, @@ -87,16 +91,27 @@ class BeamSearchDecoder : public IBeamSearchDecoder { for(int i = 0; i < models.size(); ++i) { Ptr modelOpts = New(); + // serializing this YAML can be costly, so read from cache YAML::Node config; - if(io::isBin(models[i]) && ptrs_[i] != nullptr) - io::getYamlFromModel(config, "special:model.yml", ptrs_[i]); - else - io::getYamlFromModel(config, "special:model.yml", models[i]); + auto cachedConfig = getConfigFromCache(models[i]); + if(cachedConfig != nullptr) { + config = *cachedConfig; + } else { + if(io::isBin(models[i]) && ptrs_[i] != nullptr) + io::getYamlFromModel(config, "special:model.yml", ptrs_[i]); + else + io::getYamlFromModel(config, "special:model.yml", models[i]); + writeConfigToCache(config, models[i]); + } modelOpts->merge(options_); modelOpts->merge(config); - std::cerr << modelOpts->asYamlString() << std::flush; // @TODO: take a look at why this is even here. + // serializing this to YAML is expensive. we only want to do this once + // we can use whether we loaded the cache from config as a signal + if(cachedConfig == nullptr){ + std::cerr << modelOpts->asYamlString() << std::flush; + } auto encdec = models::createModelFromOptions(modelOpts, models::usage::translation); @@ -119,6 +134,21 @@ class BeamSearchDecoder : public IBeamSearchDecoder { graph_->forward(); } + YAML::Node* getConfigFromCache(std::string key){ + const std::lock_guard lock(configCacheMutex_); + bool inCache = configCache_.find(key) != configCache_.end(); + if (inCache) { + return &configCache_[key]; + } else { + // return null if no cache hit + return nullptr; + } + } + void writeConfigToCache(YAML::Node config, std::string key) { + const std::lock_guard lock(configCacheMutex_); + configCache_[key] = config; + } + void setWorkspace(uint8_t* data, size_t size) override { device_->set(data, size); } QSNBestBatch decode(const QSBatch& qsBatch, From c83d47f1df77c7ad51fc2bacaf903d688a6c9425 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 22 Jul 2023 05:00:42 +0000 Subject: [PATCH 243/254] Merged PR 30283: Save full checkpoints at saving intervals (with iteration number) when requested. This PR adds the option `--overwrite-checkpoints` (by default true to mimic current behavior) which can be set to `false` to force full checkpoint saving and preservation at saving intervals. E.g. for a model named `rus.enu.generalnn.replica_1.model.iter37769.npz`, Marian will then also save `rus.enu.generalnn.replica_1.model.iter37769.npz.optimizer.npz` and `rus.enu.generalnn.replica_1.model.iter37769.npz.progress.yml`. 
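For concreteness, the file naming described above can be sketched with a small hypothetical helper; `checkpointFiles` exists only for this illustration and assumes the model filename ends in `.npz`, while the real logic lives in `GraphGroup::save`/`saveCheckPoint` in the diff below.

```
#include <string>
#include <vector>

// Hypothetical illustration of what gets written at a saving interval when
// --overwrite=false and checkpoint overwriting is disabled (--overwrite-checkpoint=false).
std::vector<std::string> checkpointFiles(std::string model, size_t iteration) {
  // "model.npz" -> "model.iter<N>.npz", i.e. the iteration-numbered model file
  model.replace(model.size() - 4, 4, ".iter" + std::to_string(iteration) + ".npz");
  // full checkpoint = model weights + optimizer state + training progress
  return { model, model + ".optimizer.npz", model + ".progress.yml" };
}
```

For the example above, `checkpointFiles("rus.enu.generalnn.replica_1.model.npz", 37769)` returns the iteration-numbered model file plus the two companion files named in the description.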
--- CHANGELOG.md | 3 +- VERSION | 2 +- src/common/config_parser.cpp | 5 +++ src/training/graph_group.cpp | 78 +++++++++++++++++++----------------- src/training/graph_group.h | 14 +++++-- 5 files changed, 60 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a40214ad5..79dd3f673 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Added --overwrite-checkpoint option that (when set to false) can be used to dump checkpoints with iteration numbers. - Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts. - `./marian evaluate` sub command for evaluation with COMET-QE-20, COMET-20 and BLEURT-20 - A bunch of scripts for metrics use and early MBR experiments - LSH vocab filtering for GPU. Speed is not competitive with non-LSH. Checking in for completeness and possible future use of LSH on GPU for non-filtering stuff -- Add --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) +- Added --throw-on-divergence and --fp16-fallback-to-fp32 options to detect (fp16 and fp32) and recover (only fp16) diverged runs. If not recoverable, exception gets rethrown and goes unhandled to force fatal error and shutdown. - Re-implementation of COMET-QE for inference and training; conversion scripts from Unbabel-Comet to Marian. - Validator that generates embeddings and can be used during COMET training with an external script. diff --git a/VERSION b/VERSION index d9d998341..2fc612cb1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.8 +v1.12.9 \ No newline at end of file diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 3b8d50edf..9b36338c1 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -388,6 +388,11 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add("--overwrite", "Do not create model checkpoints, only overwrite main model file with last checkpoint. " "Reduces disk usage"); + cli.add("--overwrite-checkpoint", + "When --overwrite=false (default) only model files get written at saving intervals (with iterations numbers). " + "Setting --overwrite-checkpoint=false also saves full checkpoints checkpoints with optimizer parameters, etc. 
" + "Uses (a lot) more disk space.", + true); cli.add("--no-reload", "Do not load existing model specified in --model arg"); cli.add>("--train-sets,-t", diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 43adddcac..054b0ae76 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -358,19 +358,19 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { scheduler_->load(modelFileName); // we just load it N times from disk (it'll be in disk cache after the first) - // this also allocates memory correctly when calling forward() inside restoreFromCheckPoint + // this also allocates memory correctly when calling forward() inside restoreOptimizerState size_t i = 0; for(auto graph : graphs_) models_[i++]->load(graph, items, markReloaded); // try to restore everything from checkpoint now - restoreFromCheckpoint(modelFileName, scatterFn); + loadOptimizerState(modelFileName, scatterFn); } } } -bool GraphGroup::restoreFromCheckpoint(const std::string& modelFileName, - const OptimizerBase::ScatterStateFunc& scatterFn) { +bool GraphGroup::loadOptimizerState(const std::string& modelFileName, + const OptimizerBase::ScatterStateFunc& scatterFn) { /* if model checkpoint is available: - load model from checkpoint, not from model.npz @@ -436,8 +436,8 @@ bool GraphGroup::restoreFromCheckpoint(const std::string& modelFileName, return true; // succeeded to restore } -void GraphGroup::saveCheckpoint(const std::string& modelFileName, - const OptimizerBase::GatherStateFunc& gatherFn) { +void GraphGroup::saveOptimizerState(const std::string& modelFileName, + const OptimizerBase::GatherStateFunc& gatherFn) { // @TODO: change to .checkpoint.npz, would break backwards compat std::string checkpointName = modelFileName + ".optimizer.npz"; @@ -467,50 +467,56 @@ void GraphGroup::saveCheckpoint(const std::string& modelFileName, } } -void GraphGroup::save(bool isFinal, - const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { +void GraphGroup::saveCheckPoint(const std::string& modelFileName, + bool isFinal, + bool doSaveOptimizerState, + const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { barrier(); // (for better grouping of log messages) - // bring the smoothed model in // Note that it is sharded. For multi-node, it is sharded over multiple machines, so this is a network access. // Also note that the swap must run on all MPI processes concurrently, although only one actually validates. - swapWithSmoothed(); - - if(isFinal && scheduler_) - scheduler_->validate(graphs_, isFinal); - barrier(); // (for better grouping of log messages) - - std::string modelFileName = options_->get("model"); if(isMainProcess()) { // save main model file - if(options_->get("overwrite")) { - models_[0]->save(graphs_[0], modelFileName, /*saveTranslatorConfig=*/true); - // save scheduler-related state - if(scheduler_) - scheduler_->save(modelFileName); - } else { - if(!isFinal) { // save a model with iteration number - std::string numberOfBatches = scheduler_ ? 
std::to_string(scheduler_->numberOfBatches()) : "unknown"; - std::string nameOverwrite = modelFileName; - nameOverwrite.replace(modelFileName.size() - 4, 4, ".iter" + numberOfBatches + ".npz"); - models_[0]->save(graphs_[0], nameOverwrite); - } - models_[0]->save(graphs_[0], modelFileName, /*saveTranslatorConfig=*/true); - - // save scheduler-related state - if(scheduler_) - scheduler_->save(modelFileName); - } + models_[0]->save(graphs_[0], modelFileName, /*saveTranslatorConfig=*/true); + // save scheduler-related state + if(doSaveOptimizerState && scheduler_) + scheduler_->save(modelFileName); } swapWithSmoothed(); - saveCheckpoint(modelFileName, gatherOptimizerStateFn); - + + if(doSaveOptimizerState) + saveOptimizerState(modelFileName, gatherOptimizerStateFn); + barrier(); // (for better grouping of log messages) } +void GraphGroup::save(bool isFinal, + const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { + if(isFinal && scheduler_) { + barrier(); // (for better grouping of log messages) + swapWithSmoothed(); + scheduler_->validate(graphs_, isFinal); + swapWithSmoothed(); + barrier(); // (for better grouping of log messages) + } + + std::string modelFileName = options_->get("model"); + bool overwrite = options_->get("overwrite", false); + + if(!overwrite && !isFinal) { // save a model with iteration number + std::string numberOfBatches = scheduler_ ? std::to_string(scheduler_->numberOfBatches()) : "unknown"; + std::string nameOverwrite = modelFileName; + nameOverwrite.replace(modelFileName.size() - 4, 4, ".iter" + numberOfBatches + ".npz"); + + bool overwriteCheckpoint = options_->get("overwrite-checkpoint", true); + saveCheckPoint(nameOverwrite, isFinal, /*doSaveOptimizerState=*/!overwriteCheckpoint, gatherOptimizerStateFn); + } + saveCheckPoint(modelFileName, isFinal, /*doSaveOptimizerState=*/true, gatherOptimizerStateFn); +} + void GraphGroup::swapWithSmoothed() { auto swap = [&](size_t i, size_t begin, size_t end) { auto curParam = graphs_[i]->params()->vals()->subtensor(begin, end-begin); diff --git a/src/training/graph_group.h b/src/training/graph_group.h index d7525a102..4cfd079aa 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -104,14 +104,20 @@ class GraphGroup { private: void load(const OptimizerBase::ScatterStateFunc& scatterFn); + + bool loadOptimizerState(const std::string& modelFileName, + const OptimizerBase::ScatterStateFunc& scatterFn); + void save(bool isFinal, const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn); - bool restoreFromCheckpoint(const std::string& modelFileName, - const OptimizerBase::ScatterStateFunc& scatterFn); + void saveCheckPoint(const std::string& modelFileName, + bool isFinal, + bool doSaveOptimizerState, + const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn); - void saveCheckpoint(const std::string& modelFileName, - const OptimizerBase::GatherStateFunc& gatherFn); + void saveOptimizerState(const std::string& modelFileName, + const OptimizerBase::GatherStateFunc& gatherFn); public: // This function swaps out the current optimizer parameters with the smoothed version (provided smoothing is enabled). From 09cb320d58ca20713838d8e36ed4670048ed9b42 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 08:41:32 +0100 Subject: [PATCH 244/254] Bump src/3rd_party/sentencepiece from `8dc9172` to `fb6f8e4` (#1000) Bumps [src/3rd_party/sentencepiece](https://github.com/marian-nmt/sentencepiece) from `8dc9172` to `fb6f8e4`. 
- [Commits](https://github.com/marian-nmt/sentencepiece/compare/8dc9172f88b1d4054ca38de0e5362b2935e9b53f...fb6f8e408d2078ebfedc8ccc33985fef03c50b0e) --- updated-dependencies: - dependency-name: src/3rd_party/sentencepiece dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/3rd_party/sentencepiece | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 8dc9172f8..fb6f8e408 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 8dc9172f88b1d4054ca38de0e5362b2935e9b53f +Subproject commit fb6f8e408d2078ebfedc8ccc33985fef03c50b0e From 9af4740a9524c5611eb7910464f4bb5ab36636e1 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 24 Jul 2023 12:44:21 +0000 Subject: [PATCH 245/254] Merged PR 30415: Fix macOS clang builds This PR explicitly disables server compilation in macOS build with clang. It seems an update to the macos-12 environment provided openssl and boost, which when found by cmake, enables compilation of marian-server, which doesn't work with clang. --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0f19a0f8d..29e8e6219 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -371,7 +371,7 @@ stages: -DCOMPILE_CPU=on \ -DCOMPILE_CUDA=off \ -DCOMPILE_EXAMPLES=on \ - -DCOMPILE_SERVER=on \ + -DCOMPILE_SERVER=off \ -DCOMPILE_TESTS=on \ -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on \ From b67489ec50c7b586dc19258be89dfad2eb947003 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 25 Jul 2023 00:13:18 +0000 Subject: [PATCH 246/254] Merged PR 30419: Fix Python modules in GPU regression tests Set compatible versions of Python modules after Cython 3.0 release. 
--- azure-regression-tests.yml | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index cb3730c19..0448b172a 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -64,6 +64,24 @@ stages: displayName: Collect system info workingDirectory: regression-tests + # Always run regression tests from the master branch + # The current SAS token will expire on 12/31/2023 and a new one will need to be set in Marian > Pipelines > Library + # This is run at the beginning for easier debugging of the Python environment + - bash: | + set -x + git checkout master + git pull origin master + # Uninstall Cython because the newest 3.0.0 is incompatible with newest available versions of pyyaml and numpy as of July 2023 + python3 -m pip uninstall -y cython + python3 -m pip install 'cython<3' + # These modules will be installed via `make install` below, but Cython needs to be installed before + python3 -m pip install 'pyyaml<6.0.1' 'numpy>=1.22,<2' websocket-client + make install + displayName: Prepare regression tests + env: + AZURE_STORAGE_SAS_TOKEN: $(marian-pub-tests-blob-sas-token) + workingDirectory: regression-tests + # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - bash: | wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - @@ -106,17 +124,6 @@ stages: displayName: Run unit tests workingDirectory: build - # Always run regression tests from the master branch - # The current SAS token will expire on 12/31/2023 and a new one will need to be set in Marian > Pipelines > Library - - bash: | - git checkout master - git pull origin master - make install - displayName: Prepare regression tests - env: - AZURE_STORAGE_SAS_TOKEN: $(marian-pub-tests-blob-sas-token) - workingDirectory: regression-tests - # Continue on error to be able to collect outputs and publish them as an artifact - bash: MARIAN=../build ./run_mrt.sh continueOnError: true From 68cc88f70c1c5011288cb0f9360dd5a8a68a0f7e Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Tue, 25 Jul 2023 16:20:26 +0100 Subject: [PATCH 247/254] Fix macOS actions (#1002) --- .github/workflows/macos.yml | 2 +- regression-tests | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index f06eed256..8b992e404 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -30,7 +30,7 @@ jobs: -DCOMPILE_CPU=on \ -DCOMPILE_CUDA=off \ -DCOMPILE_EXAMPLES=on \ - -DCOMPILE_SERVER=on \ + -DCOMPILE_SERVER=off \ -DCOMPILE_TESTS=on \ -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on diff --git a/regression-tests b/regression-tests index 89ce02e3a..ab6fd7365 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 89ce02e3a3e5786d7ae7802108f6a0288f70c269 +Subproject commit ab6fd7365f1b40633a1164dd35c6a15b55f2d4d9 From 717d351ca1165e8f640c3d087a01ee52c4d897c4 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 26 Jul 2023 17:13:22 +0000 Subject: [PATCH 248/254] Merged PR 30406: More general fallbacks for diverged training This PR adds `--custom-fallbacks` and generalizes the previous attempt at handling diverged trainings. Now we can specify any number of fallback options that get used in subsequent diverged trainings. E.g. 
we can restart a training from the last checkpoint by turning off fp16 training and if we still encounter a divergence, we can also lower the learning rate on the next attempt. This would be achieved by adding the following to a config file: ``` custom-fallbacks: - fp16: false precision: [float32, float32] cost-scaling: [] - fp16: false precision: [float32, float32] cost-scaling: [] learn-rate: 0.0001 ``` On the command line we can specify json-style options like `--custom-fallbacks "{fp16: false, precision: [float32, float32], cost-scaling: []}" "{fp16: false, precision: [float32, float32], cost-scaling: [], learn-rate: 0.0001}"` where each string in `"..."` gets parsed to a Yaml list entry. The previous option `--fp16-fallback-to-fp32` is now just an alias for the corresponding `--custom-fallbacks` values (first entry above). Any number of fallbacks can be specified. --- CHANGELOG.md | 1 + VERSION | 2 +- src/common/config_parser.cpp | 19 ++++++- src/common/options.cpp | 94 ++++++++++++++++++++++++++++++- src/common/options.h | 73 ++++++++++++++---------- src/embedder/vector_collector.cpp | 4 +- src/embedder/vector_collector.h | 2 +- src/training/scheduler.h | 32 ++++++++--- src/training/training.h | 59 ++++++++++--------- src/training/training_state.h | 19 ++++--- src/training/validator.h | 2 +- src/translator/scorers.cpp | 4 +- src/translator/translator.h | 4 +- 13 files changed, 229 insertions(+), 86 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79dd3f673..f70f73ab2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Added --custom-fallbacks option that allows to specify a list of option sets that get traversed for subsequent fallbacks upon divergence - Added --overwrite-checkpoint option that (when set to false) can be used to dump checkpoints with iteration numbers. - Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts. - `./marian evaluate` sub command for evaluation with COMET-QE-20, COMET-20 and BLEURT-20 diff --git a/VERSION b/VERSION index 2fc612cb1..fe4dae579 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.9 \ No newline at end of file +v1.12.10 \ No newline at end of file diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 9b36338c1..bad9904f9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -572,12 +572,29 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") ->implicit_val("8.f 10000 1.f 8.f"); + cli.add>("--throw-on-divergence", "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") - ->implicit_val("100 10 3.0f"); + ->implicit_val("1000 10 5.0f"); + cli.add>("--custom-fallbacks", + "List of custom fallback options after divergence. Each caught divergence exception thrown when --throw-on-divergence conditions are met progresses through another fallback. 
" + "If more exception are caught than fallbacks were specified the process will terminate with an uncaught exception."); + cli.add("--fp16-fallback-to-fp32", "If fp16 training diverges and throws try to continue training with fp32 precision"); + cli.alias("fp16-fallback-to-fp32", "true", [](YAML::Node& config) { + // use default custom-fallbacks to handle DivergenceException for fp16 + config["custom-fallbacks"] = std::vector({ + YAML::Load("{fp16 : false, precision: [float32, float32], cost-scaling: []}") + }); + }); + + // @TODO: implement this next: + // cli.add("--recover-from-fallback-after", + // "Attempt to return to default options once the training has progressed in fallback mode by this many units. " + // "Allowed units are the same as for disp-freq (i.e. (u)pdates, (t)okens, (e)pochs)"); + cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", diff --git a/src/common/options.cpp b/src/common/options.cpp index 59e8420a4..18f5b17d4 100644 --- a/src/common/options.cpp +++ b/src/common/options.cpp @@ -2,6 +2,96 @@ namespace marian { +// name space for helper template specializations +namespace options_helpers { + +// Generic template-based implementation +template +T Get::apply(const Options* opt, const char* const key) { +#if FASTOPT + opt->lazyRebuild(); + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + return opt->fastOptions_[key].as(); +#else + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + return opt->options_[key].as(); +#endif +} + +// Generic template-based implementation +template +T Get::apply(const Options* opt, const char* const key, const T& defaultValue) { +#if FASTOPT + opt->lazyRebuild(); + if(opt->has(key)) + return opt->fastOptions_[key].as(); +#else + if(opt->has(key)) + return opt->options_[key].as(); +#endif + else + return defaultValue; +} + +// specializations for simple types +template struct Get; +template struct Get; +template struct Get; +template struct Get; +template struct Get; +template struct Get; +template struct Get; + +// specialization for vector of simple types +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; +template struct Get>; + +// specializations for std::vector +template <> +std::vector Get>::apply(const Options* opt, const char* const key) { + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + auto vec = opt->options_[key].as>(); + for(auto& node : vec) { + if(node.IsScalar()) + node = YAML::Load(node.as()); + } + return vec; +} + +template <> +std::vector Get>::apply(const Options* opt, const char* const key, const std::vector& defaultValue) { + if(opt->has(key)) + return apply(opt, key); + return defaultValue; +} + +template struct Get>; + +// specializations for YAML::Node +template <> +YAML::Node Get::apply(const Options* opt, const char* const key) { + ABORT_IF(!opt->has(key), "Required option '{}' has not been set", key); + YAML::Node node = opt->options_[key]; + if(node.IsScalar()) + node = YAML::Load(node.as()); + return node; +} + +template <> +YAML::Node Get::apply(const Options* opt, const char* const key, const YAML::Node& defaultValue) { + if(opt->has(key)) + return apply(opt, key); + return defaultValue; +} + +template struct Get; +} + Options::Options() #if FASTOPT 
: fastOptions_(options_) @@ -16,8 +106,8 @@ Options::Options(const Options& other) : options_(YAML::Clone(other.options_)) {} #endif -Options Options::clone() const { - return Options(*this); // fastOptions_ get set in constructor above +Ptr Options::clone() const { + return New(*this); // fastOptions_ get set in constructor above } YAML::Node Options::cloneToYamlNode() const { diff --git a/src/common/options.h b/src/common/options.h index 992be8760..91ef65f2c 100644 --- a/src/common/options.h +++ b/src/common/options.h @@ -30,6 +30,17 @@ namespace YAML { \ namespace marian { +class Options; + +// helper class to enable template specialization in options.cpp +namespace options_helpers { + template + struct Get { + static T apply(const Options* opt, const char* const key); + static T apply(const Options* opt, const char* const key, const T& defaultValue); + }; +} + /** * Container for options stored as key-value pairs. Keys are unique strings. * This is not thread-safe and locking is the responsibility of the caller. @@ -60,6 +71,8 @@ class Options { public: Options(); + + // This creates a proper clone Options(const Options& other); // constructor with one or more key-value pairs @@ -72,20 +85,34 @@ class Options { Options(const YAML::Node& node) : Options() { merge(node); } - - // constructor that clones and zero or more updates + + template + friend struct options_helpers::Get; + + // Clones current set of options + Ptr clone() const; + + // Clones current set of options and performs zero updates (just calls clone()). + Ptr with() const { + return clone(); + } + + // Clones current set of options and performs one or more updates // options->with("var1", val1, "var2", val2, ...) - template - Ptr with(Args&&... args) const { - auto options = New(*this); - options->set(std::forward(args)...); + template + Ptr with(const std::string& key, T value, Args&&... args) const { + auto options = clone(); + options->set(key, value, std::forward(args)...); return options; } - /** - * @brief Return a copy of the object that can be safely modified. - */ - Options clone() const; + // Clones current set of options and performs zero or more updates from a YAML::Node. + // Matching existing options get overwritten with options from the argument node. + Ptr with(const YAML::Node& node) const { + auto options = clone(); + options->merge(node, /*overwrite=*/true); + return options; + } // Do not allow access to internal YAML object as changes on the outside are difficult to track // and mess with the rebuilding of the fast options lookup. Hence only return a clone which guarentees @@ -129,14 +156,8 @@ class Options { template T get(const char* const key) const { -#if FASTOPT - lazyRebuild(); - ABORT_IF(!has(key), "Required option '{}' has not been set", key); - return fastOptions_[key].as(); -#else - ABORT_IF(!has(key), "Required option '{}' has not been set", key); - return options_[key].as(); -#endif + // this way we can add type-based specialization, e.g. use options_ for YAML::Node and fastOptions_ for other types. See options.cpp + return options_helpers::Get::apply(this, key); } template @@ -145,21 +166,13 @@ class Options { } template - T get(const char* const key, T defaultValue) const { -#if FASTOPT - lazyRebuild(); - if(has(key)) - return fastOptions_[key].as(); -#else - if(has(key)) - return options_[key].as(); -#endif - else - return defaultValue; + T get(const char* const key, const T& defaultValue) const { + // As above, this way we can add type-based specialization, e.g. 
use options_ for YAML::Node and fastOptions_ for other types. See options.cpp + return options_helpers::Get::apply(this, key, defaultValue); } template - T get(const std::string& key, T defaultValue) const { + T get(const std::string& key, const T& defaultValue) const { return get(key.c_str(), defaultValue); } diff --git a/src/embedder/vector_collector.cpp b/src/embedder/vector_collector.cpp index eb55779e0..1268de530 100644 --- a/src/embedder/vector_collector.cpp +++ b/src/embedder/vector_collector.cpp @@ -94,7 +94,7 @@ void AveragingVectorCollector::WriteAverage() { Ptr VectorCollector::Create(Ptr options) { std::string average = options->get("average", "skip"); std::string output = options->get("output"); - size_t width = options->get("width", DEFAULT_WIDTH); + size_t width = options->get("width", VectorCollector::DEFAULT_WIDTH); Ptr collector; if(average == "skip") @@ -109,4 +109,6 @@ Ptr VectorCollector::Create(Ptr options) { return collector; } +const size_t VectorCollector::DEFAULT_WIDTH = 4; + } // namespace marian diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index 3f1f91e0c..6c727203c 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -14,7 +14,7 @@ namespace marian { // on its binary flag. If binary=false, width can be used to set the number of decimal places. class VectorCollector { public: - static const size_t DEFAULT_WIDTH = 4; + static const size_t DEFAULT_WIDTH; VectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH); VectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH); diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 9c84d1593..f0f39330d 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -30,10 +30,11 @@ class Scheduler : public TrainingObserver { bool first_{true}; // true if this is the first update after renewing the training - bool throwOnDivergence_{false}; // throw an exception if training divergence is detected - size_t lossAvgWindowSlow_{100}; // window size for slow-moving average loss for divergence detection - size_t lossAvgWindowFast_{10}; // window size for fast-moving average loss for divergence detection - float divergenceTolerance_{3.f}; // tolerance for divergence detection as multiples of standard deviation + bool throwOnDivergence_{false}; // throw an exception if training divergence is detected + size_t lossAvgWindowSlow_{1000}; // window size for slow-moving average loss for divergence detection + size_t lossAvgWindowFast_{10}; // window size for fast-moving average loss for divergence detection + float divergenceTolerance_{5.f}; // tolerance for divergence detection as multiples of standard deviation + SchedulingParameter throwAfter_; // for diagnostics only; training will throw if non-zero and training has progressed this far size_t gradientNormAvgWindow_{100}; // window size for recording the exponential average of gradient norms, after this many updates about 90% of the mass comes from this many last updates SchedulingParameter logicalEpoch_; @@ -161,10 +162,17 @@ class Scheduler : public TrainingObserver { lossAvgWindowFast_ = std::stoul(throwParameters[1]); if(throwParameters.size() > 2) divergenceTolerance_ = std::stof(throwParameters[2]); - LOG(info, - "[scheduler] Divergence detection is enabled for slow-moving averaging window over {} steps " - "vs fast-moving window over {} steps with tolerance of {} sigmas", - lossAvgWindowSlow_, lossAvgWindowFast_, divergenceTolerance_); + 
if(throwParameters.size() > 3) + throwAfter_ = SchedulingParameter::parse(throwParameters[3]); + + LOG(info, + "[scheduler] Divergence detection is enabled for slow-moving averaging window over {} steps " + "vs fast-moving window over {} steps with tolerance of {} sigmas", + lossAvgWindowSlow_, lossAvgWindowFast_, divergenceTolerance_); + + if(throwAfter_) { + LOG(warn, "[scheduler] Diagnostic DivergenceException will be thrown when training reaches {}", (std::string)throwAfter_); + } } // parse logical-epoch parameters @@ -505,6 +513,14 @@ class Scheduler : public TrainingObserver { } } } + + // purely diagnostic. This will throw a divergence exception once the specified training progress has occurred. + if(throwAfter_) { + if(state_->enteredNewPeriodOf(throwAfter_)) { + LOG(warn, "Training reached {}; throwing diagnostic DivergenceException", (std::string)throwAfter_); + throw DivergenceException(state_->lossAvgSlow, state_->lossAvgFast, 0.f); + } + } // log slow-moving exponential average and variance of training cost stats float deltaSlow = currentNormalizedLoss - state_->lossAvgSlow; diff --git a/src/training/training.h b/src/training/training.h index cbca3eff2..e608cd11a 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -45,13 +45,18 @@ class Train : public ModelTask { dataset->prepare(); - // We run training in a do-while loop. It should only restart if a fp16 training run was interrupted + // We run training in a do-while loop. It should only restart if a training run was interrupted // via the throwing of a DivergenceException from training/scheduler.h and if --throw-on-divergence and - // --fp16-fallback-to-fp32 are enabled. - // The repeated training run will continue from last checkpoint (similar to a manually interrupted training) - // but attempt training in fp32. If that training run or any other fp32 training happens to diverge, - // training will exit with an unhandled DivergenceException. This is on purpose to indicate a fatal error. - bool restartTraining; + // custom-fallbacks are specified (directly or the via alias fp16-fallback-to-fp32) otherwise it will die with the rethrown exception. + // The repeated training run will continue from the last checkpoint (similar to a manually interrupted training) + // but attempt training with the options specified in the current fallback. If that training run in turn happens to diverge, + // training will move on to the next defined fallback or exit with an unhandled DivergenceException if there are no more fallbacks. + // The unhandled exception is on purpose to indicate a fatal error. + + auto originalOptions = options_->clone(); // clone in order to keep unaltered option object around + bool restartTraining; // record if training should be restarted after catching a DivergenceException + size_t restartCounter = 0; // count how many restarts occured. Used to progress through the list of fallbacks + do { try { // there will be only one training loop execution unless in special situations, @@ -133,34 +138,28 @@ class Train : public ModelTask { } catch(DivergenceException& e) { // handling divergent training if scheduler is configured // to throw via --throw-on-divergence - if(options_->get("fp16-fallback-to-fp32", false)) { - auto precisions = options_->get>("precision"); - Type parameterType = typeFromString(precisions[0]); - if(parameterType == Type::float16) { - // we diverged, but we were apparently training with fp16 and fallback to fp32 - // is enabled. 
There is a chance we can rescue the training run by restarting - // from the last checkpoint but using fp32 precision training. - LOG(warn, "Training diverged, but --fp16-fallback-to-fp32 is enabled. " - "Attempting restart from the last checkpoint with fp32 precision."); - - // undo all options that would be set for fp16 training - options_ = options_->with( - "fp16", false, - "precision", std::vector({"float32", "float32"}), - "cost-scaling", std::vector({}) - ); + + // get the list of possible fallback set of options + auto fallbacks = options_->get>("custom-fallbacks", {}); + + // check if we exceeded the number of available fallbacks, if not, take the current one + if(restartCounter < fallbacks.size()) { + auto fallback = fallbacks[restartCounter]; + fallback.SetStyle(YAML::EmitterStyle::Flow); + + // we diverged, but a set of fallback options is specified. There is a chance we can rescue the training run by + // restarting from the last checkpoint with the options from the current fallback. + LOG(warn, "Training diverged, but fallback is enabled. Attempting restart from the last checkpoint with these options: {}", YAML::Dump(fallback)); + + // overwrite all original options with fallback options + options_ = originalOptions->with(fallback); // this gets checked at final do-while condition restartTraining = true; - } else { - // We diverged and fallback is enabled, but we are already training with fp32, - // hence rethrow and let training die with error. - LOG(warn, "Training diverged, rethrowing divergence exception"); - throw e; - } + restartCounter++; } else { - // We diverged and no fallback enabled, hence rethrow and let training die with error. - LOG(warn, "Training diverged, rethrowing divergence exception"); + // we diverged and no fallback is available, hence rethrow and let training die with error. + LOG(warn, "Training diverged and there are either no fallbacks or we exceeded the number of defined fallbacks, rethrowing divergence exception"); throw e; } } diff --git a/src/training/training_state.h b/src/training/training_state.h index 800dd60c7..d034d93a1 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -147,15 +147,20 @@ class TrainingState { // between calls to this. We call it from update(). Unfortunately, newEpoch() // is called at the wrong place for this to work, so SchedulingUnit::epoch is forbidden // for periods. 
- bool enteredNewPeriodOf(std::string schedulingParam) const { - auto period = SchedulingParameter::parse(schedulingParam); + bool enteredNewPeriodOf(SchedulingParameter schedulingParam) const { // @TODO: adapt to logical epochs - ABORT_IF(period.unit == SchedulingUnit::epochs, + ABORT_IF(schedulingParam.unit == SchedulingUnit::epochs, "Unit {} is not supported for frequency parameters", - schedulingParam); - auto previousProgress = getPreviousProgressIn(period.unit); - auto progress = getProgressIn(period.unit); - return period && progress / period.n != previousProgress / period.n; + (std::string)schedulingParam); + auto previousProgress = getPreviousProgressIn(schedulingParam.unit); + auto progress = getProgressIn(schedulingParam.unit); + return schedulingParam && progress / schedulingParam.n != previousProgress / schedulingParam.n; + } + + // std::string version of the above function + bool enteredNewPeriodOf(std::string schedulingParam) const { + SchedulingParameter parsedSchedulingParam = SchedulingParameter::parse(schedulingParam); + return enteredNewPeriodOf(parsedSchedulingParam); } void newEpoch() { diff --git a/src/training/validator.h b/src/training/validator.h index aed710778..364c3893d 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -59,7 +59,7 @@ class Validator : public ValidatorBase { : ValidatorBase(lowerIsBetter, epsilon), vocabs_(vocabs), // options_ is a clone of global options, so it can be safely modified within the class - options_(New(options->clone())) { + options_(options->clone()) { // set options common for all validators options_->set("inference", true); options_->set("shuffle", "none"); // don't shuffle validation sets diff --git a/src/translator/scorers.cpp b/src/translator/scorers.cpp index 60ec03dd1..7c9745c22 100644 --- a/src/translator/scorers.cpp +++ b/src/translator/scorers.cpp @@ -60,7 +60,7 @@ std::vector> createScorers(Ptr options, const std::vector(options->clone()); + auto modelOptions = options->clone(); if(!options->get("ignore-model-config")) { YAML::Node modelYaml; io::getYamlFromModel(modelYaml, "special:model.yml", items); @@ -115,7 +115,7 @@ std::vector> createScorers(Ptr options, const std::vector(options->clone()); + auto modelOptions = options->clone(); if(!options->get("ignore-model-config")) { YAML::Node modelYaml; io::getYamlFromModel(modelYaml, "special:model.yml", ptr); diff --git a/src/translator/translator.h b/src/translator/translator.h index 205c213cb..f0fc0b908 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -42,7 +42,7 @@ class Translate : public ModelTask { public: Translate(Ptr options) - : options_(New(options->clone())) { // @TODO: clone should return Ptr same as "with"? 
+ : options_(options->clone()) { // This is currently safe as the translator is either created stand-alone or // or config is created anew from Options in the validator @@ -252,7 +252,7 @@ class TranslateService : public ModelServiceTask { virtual ~TranslateService() {} TranslateService(Ptr options) - : options_(New(options->clone())) { + : options_(options->clone()) { // initialize vocabs options_->set("inference", true); options_->set("shuffle", "none"); From e383583ae5b0f2f82cedb06c4cd7c5f036fb90a3 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 27 Jul 2023 17:07:28 +0000 Subject: [PATCH 249/254] Merged PR 30482: Fixes for backward compatibility in fine-tuning This PR fixes fine-tuning a model trained with an older version of Marian by: - adding the removed option `num-devices` to the list of deprecated options - checking if `loss-{arg,var}-{slow,fast}` are present in .progress.yml file --- VERSION | 2 +- src/common/cli_wrapper.cpp | 1 + src/training/training_state.h | 10 +++++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/VERSION b/VERSION index fe4dae579..e47557093 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.10 \ No newline at end of file +v1.12.11 diff --git a/src/common/cli_wrapper.cpp b/src/common/cli_wrapper.cpp index fee50a2cb..343ff2ba2 100644 --- a/src/common/cli_wrapper.cpp +++ b/src/common/cli_wrapper.cpp @@ -13,6 +13,7 @@ namespace cli { const std::unordered_set DEPRECATED_OPTIONS = { "version", "special-vocab", + "num-devices", // @TODO: uncomment once we actually deprecate them. // "after-batches", // "after-epochs" diff --git a/src/training/training_state.h b/src/training/training_state.h index d034d93a1..c522caa85 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -209,6 +209,10 @@ class TrainingState { void loadFromString(const std::string& yamlString) { YAML::Node config = YAML::Load(yamlString); + // WARNING! When adding new options to the training state, make sure to + // check of their existance when loading from the progress.yml + // file for backward compatibility + epochs = config["epochs"].as(); batches = config["batches"].as(); batchesEpoch = config["batches-epoch"].as(); @@ -241,9 +245,9 @@ class TrainingState { samplesDisp = config["disp-samples"].as(); updatesDisp = config["disp-updates"].as(); - lossAvgSlow = config["loss-avg-slow"].as(); - lossAvgFast = config["loss-avg-fast"].as(); - lossVarSlow = config["loss-var-slow"].as(); + lossAvgSlow = config["loss-avg-slow"] ? config["loss-avg-slow"].as() : 0; + lossAvgFast = config["loss-avg-fast"] ? config["loss-avg-fast"].as() : 0; + lossVarSlow = config["loss-var-slow"] ? config["loss-var-slow"].as() : 0; gradientNormAvg = config["gradient-norm-avg"].as(); gradientNormVar = config["gradient-norm-var"].as(); From 3bd25dd59ed118f7433b2692b314f211c72b578c Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 31 Jul 2023 08:03:37 +0000 Subject: [PATCH 250/254] Merged PR 30516: Make sure that loss is finite when checking for divergence Make sure that the averaged loss is actually well-defined and not inf or nan. 
--- VERSION | 2 +- src/common/definitions.h | 10 +++++++++- src/training/graph_group.h | 8 -------- src/training/scheduler.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/VERSION b/VERSION index e47557093..dc5ef6d14 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.11 +v1.12.12 diff --git a/src/common/definitions.h b/src/common/definitions.h index e28ea5dcf..37213d37a 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -193,6 +193,14 @@ typedef Ptr ClipperBasePtr; class RunBase; typedef Ptr RunBasePtr; - const float NEMATUS_LN_EPS = 1e-5f; + +// With -Ofast enabled gcc will fail to identify NaN or Inf. Safeguard here. +static inline bool isFinite(float x) { +#ifdef __GNUC__ + ABORT_IF(std::isfinite(0.f / 0.f), "NaN detection unreliable. Disable -Ofast compiler option."); +#endif + return std::isfinite(x); +} + } // namespace marian diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 4cfd079aa..b0c98e3ce 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -11,14 +11,6 @@ namespace marian { -// With -Ofast enabled gcc will fail to identify NaN or Inf. Safeguard here. -static inline bool isFinite(float x) { -#ifdef __GNUC__ - ABORT_IF(std::isfinite(0.f / 0.f), "NaN detection unreliable. Disable -Ofast compiler option."); -#endif - return std::isfinite(x); -} - #ifdef _MSC_VER // MS Visual studio insists that this funtion is not being referenced although is being referenced by name as an argument #pragma warning(push) #pragma warning(disable: 4505) //Unreferenced local function has been removed diff --git a/src/training/scheduler.h b/src/training/scheduler.h index f0f39330d..df902e6ef 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -466,7 +466,7 @@ class Scheduler : public TrainingObserver { state_->newUpdate(numReadBatches); // true if --throw-on-divergence [lossAvgWindowSlow_] [lossAvgWindowFast_] [divergenceTolerance_] is enabled, false otherwise - if(throwOnDivergence_) { + if(throwOnDivergence_ && isFinite(currentNormalizedLoss)) { size_t windowSlow = std::min(lossAvgWindowSlow_, state_->batches); // we compare the running exponential average over a longer window size_t windowFast = std::min(lossAvgWindowFast_, state_->batches); // with the running exponential everage over a shorter window (for smoothing) From 60aa66bab9e45214fd0f4760bad27f7785ed2ddc Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 14 Aug 2023 21:41:08 +0000 Subject: [PATCH 251/254] Merged PR 30704: Merge with public master from 20230814 --- .github/workflows/macos.yml | 2 +- cmake/FindSSE.cmake | 30 ++++++++++----------- examples | 2 +- regression-tests | 2 +- src/3rd_party/fbgemm | 2 +- src/3rd_party/sentencepiece | 2 +- src/onnx/expression_graph_onnx_exporter.cpp | 2 +- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index f06eed256..8b992e404 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -30,7 +30,7 @@ jobs: -DCOMPILE_CPU=on \ -DCOMPILE_CUDA=off \ -DCOMPILE_EXAMPLES=on \ - -DCOMPILE_SERVER=on \ + -DCOMPILE_SERVER=off \ -DCOMPILE_TESTS=on \ -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on diff --git a/cmake/FindSSE.cmake b/cmake/FindSSE.cmake index e1c58fbc9..0f1483487 100644 --- a/cmake/FindSSE.cmake +++ b/cmake/FindSSE.cmake @@ -4,7 +4,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) - STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" 
SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") @@ -13,14 +13,14 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ENDIF (SSE2_TRUE) # /proc/cpuinfo apparently omits sse3 :( - STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE) IF (NOT SSE3_TRUE) - STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE) ENDIF (NOT SSE3_TRUE) - STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE) IF (SSE3_TRUE OR SSSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") @@ -33,7 +33,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) - STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") @@ -41,7 +41,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) - STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE) IF (SSE42_TRUE) set(SSE4_2_FOUND true CACHE BOOL "SSE4.2 available on host") @@ -49,7 +49,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host") ENDIF (SSE42_TRUE) - STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx" "${SSE_THERE}" AVX_TRUE) IF (AVX_TRUE) set(AVX_FOUND true CACHE BOOL "AVX available on host") @@ -57,7 +57,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(AVX_FOUND false CACHE BOOL "AVX available on host") ENDIF (AVX_TRUE) - STRING(REGEX REPLACE "^.*(avx2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx2" "${SSE_THERE}" AVX2_TRUE) IF (AVX2_TRUE) set(AVX2_FOUND true CACHE BOOL "AVX2 available on host") @@ -65,7 +65,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(AVX2_FOUND false CACHE BOOL "AVX2 available on host") ENDIF (AVX2_TRUE) - STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE) IF (AVX512_TRUE) set(AVX512_FOUND true CACHE BOOL "AVX512 available on host") @@ -76,7 +76,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features machdep.cpu.leaf7_features" OUTPUT_VARIABLE CPUINFO) - STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE) IF (SSE2_TRUE) set(SSE2_FOUND true CACHE BOOL "SSE2 available on host") @@ -84,7 +84,7 @@ 
ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(SSE2_FOUND false CACHE BOOL "SSE2 available on host") ENDIF (SSE2_TRUE) - STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE) IF (SSE3_TRUE) set(SSE3_FOUND true CACHE BOOL "SSE3 available on host") @@ -100,7 +100,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host") ENDIF (SSSE3_TRUE) - STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE) IF (SSE41_TRUE) set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host") @@ -108,7 +108,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host") ENDIF (SSE41_TRUE) - STRING(REGEX REPLACE "^.*(AVX).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(AVX).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "AVX" "${SSE_THERE}" AVX_TRUE) IF (AVX_TRUE) set(AVX_FOUND true CACHE BOOL "AVX available on host") @@ -116,7 +116,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(AVX_FOUND false CACHE BOOL "AVX available on host") ENDIF (AVX_TRUE) - STRING(REGEX REPLACE "^.*(AVX2).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(AVX2).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "AVX2" "${SSE_THERE}" AVX2_TRUE) IF (AVX2_TRUE) set(AVX2_FOUND true CACHE BOOL "AVX2 available on host") @@ -124,7 +124,7 @@ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") set(AVX2_FOUND false CACHE BOOL "AVX2 available on host") ENDIF (AVX2_TRUE) - STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE ${CPUINFO}) + STRING(REGEX REPLACE "^.*(avx512).*$" "\\1" SSE_THERE "${CPUINFO}") STRING(COMPARE EQUAL "avx512" "${SSE_THERE}" AVX512_TRUE) IF (AVX512_TRUE) set(AVX512_FOUND true CACHE BOOL "AVX512 available on host") diff --git a/examples b/examples index 58f48a067..6c40475a9 160000 --- a/examples +++ b/examples @@ -1 +1 @@ -Subproject commit 58f48a06756c623fe799613134810322e061863f +Subproject commit 6c40475a9cbdcc219d0b6a8347ae43902204eedc diff --git a/regression-tests b/regression-tests index 2a8bed3f0..ab6fd7365 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 2a8bed3f0e937a9de2d6fa92dee3bcf482d3d47b +Subproject commit ab6fd7365f1b40633a1164dd35c6a15b55f2d4d9 diff --git a/src/3rd_party/fbgemm b/src/3rd_party/fbgemm index 6f45243cb..0e33146d3 160000 --- a/src/3rd_party/fbgemm +++ b/src/3rd_party/fbgemm @@ -1 +1 @@ -Subproject commit 6f45243cb8ab7d7ab921af18d313ae97144618b8 +Subproject commit 0e33146d3e7f070c7de9494efef49147a9d20558 diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index 8dc9172f8..fb6f8e408 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit 8dc9172f88b1d4054ca38de0e5362b2935e9b53f +Subproject commit fb6f8e408d2078ebfedc8ccc33985fef03c50b0e diff --git a/src/onnx/expression_graph_onnx_exporter.cpp b/src/onnx/expression_graph_onnx_exporter.cpp index d27f1360c..8e6625a42 100644 --- a/src/onnx/expression_graph_onnx_exporter.cpp +++ b/src/onnx/expression_graph_onnx_exporter.cpp @@ -5,7 +5,7 @@ #include "models/model_factory.h" #include "models/encoder_decoder.h" #include "data/corpus_base.h" -#include "tensors/cpu/fbgemm/expression_graph_packable.h" +#include "tensors/cpu/expression_graph_packable.h" #include 
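For context on the FindSSE.cmake hunks above: the merge quotes "${CPUINFO}" in every STRING(REGEX REPLACE ...) call, presumably so that configuration does not abort when the variable expands to nothing (for example, no readable /proc/cpuinfo, or a sysctl key that is not available on the host). That reading is inferred from the diff rather than stated in the commit message; the following is a minimal illustrative CMake sketch, assuming an empty CPUINFO, and is not part of any patch:

    set(CPUINFO "")

    # Unquoted: an empty variable expands to zero arguments, so the required
    # <input> string is missing and CMake fails at configure time.
    # STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO})

    # Quoted: a single empty string is passed; the regex does not match, so
    # SSE_THERE stays empty and the later STRING(COMPARE EQUAL ...) checks
    # simply report the feature as not found.
    STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE "${CPUINFO}")
    message(STATUS "SSE_THERE='${SSE_THERE}'")
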
From 3f93e656ea6be6f9a8816fa696ba7c435343fc2e Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 15 Aug 2023 12:55:24 -0700 Subject: [PATCH 252/254] don't include nppdefs.h. Problematic on some machines (#1004) Co-authored-by: Hieu Hoang --- src/tensors/gpu/tensor_operators.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 5f8c4c122..6dbded2a4 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1,7 +1,5 @@ -# if defined(_MSC_VER) +# if !defined(NPP_MAX_32U) #define NPP_MAX_32U ( 4294967295U ) /**< Maximum 32-bit unsigned integer */ -#else -#include #endif #include "common/types.h" @@ -3548,7 +3546,7 @@ __global__ void HammmingAndSort(const uint32_t *weightHash, if (outIdx != NPP_MAX_32U) { uint32_t prevOutIdx; // Not supported in Maxwells or older -// Not supported in Maxwells or older +// Not supported in Maxwells or older #if __CUDA_ARCH__ >= 600 prevOutIdx = atomicAdd_block(&outIdx, (uint32_t) -1); #else From 961a728857ac5258a7a38bcaef900992eb44e06c Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Thu, 17 Aug 2023 13:24:55 +0300 Subject: [PATCH 253/254] =?UTF-8?q?Add=20an=20option=20to=20not=20encode?= =?UTF-8?q?=20sentencepiece=20during=20training/decoding=20al=E2=80=A6=20(?= =?UTF-8?q?#1003)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add an option to not encode sentencepiece during training/decoding allowing passing of spmIDs directly * Update changelog * numbers -> pieces --- CHANGELOG.md | 1 + src/common/config_parser.cpp | 6 ++++++ src/data/sentencepiece_vocab.cpp | 32 ++++++++++++++++++++++---------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f70f73ab2..a50945fb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. - Added --custom-fallbacks option that allows to specify a list of option sets that get traversed for subsequent fallbacks upon divergence - Added --overwrite-checkpoint option that (when set to false) can be used to dump checkpoints with iteration numbers. - Implementations of COMET-20 (reference-based) and BLEURT-20 for inference with conversion scripts. diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index bad9904f9..a6e38792f 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -411,6 +411,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. " "When set to 0 all lines are going to be used.", 2000000); + cli.add("--no-spm-encode", + "Assume the input has already had sentencepiece applied before decoding. " + "Expects spm pieces, like the ones produced by spm_encode's default format."); #endif // scheduling options @@ -752,6 +755,9 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { #ifdef USE_SENTENCEPIECE cli.add("--no-spm-decode", "Keep the output segmented into SentencePiece subwords"); + cli.add("--no-spm-encode", + "Assume the input has already had sentencepiece applied before decoding. 
" + "Expects spm pieces, like the ones produced by spm_encode's default format."); #endif addSuboptionsInputLength(cli); diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index dc06cc17b..548b95a46 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -39,6 +39,9 @@ class SentencePieceVocab : public IVocab { // Keeps sentences segmented into subword units bool keepEncoded_{false}; + // Assume sentencepiece has already been applied and we are expecting spm pieces as input + bool noEncode_{false}; + // Contains control characters added to vocab due to byte-fallback std::vector controlChars_; @@ -127,7 +130,8 @@ class SentencePieceVocab : public IVocab { : options_(options), batchIndex_(batchIndex), generator_((uint32_t)Config::seed), - keepEncoded_(options->get("no-spm-decode", false)) { + keepEncoded_(options->get("no-spm-decode", false)), + noEncode_(options->get("no-spm-encode", false)) { if(options_->has("sentencepiece-alphas")) { auto alphas = options_->get>("sentencepiece-alphas"); if(alphas.size() <= batchIndex) @@ -221,16 +225,24 @@ class SentencePieceVocab : public IVocab { } Words encode(const std::string& line, bool addEOS, bool inference) const override { - std::vector spmIds; - if(inference || alpha_ == 0) - spm_->Encode(line, &spmIds); - else - spm_->SampleEncode(line, -1, alpha_, &spmIds); - Words words; - words.reserve(spmIds.size() + addEOS); - for (auto&& spmId : spmIds) - words.push_back(Word::fromWordIndex(spmId)); + if (noEncode_) { + auto lineTokens = utils::split(line, " "); + words.reserve(lineTokens.size() + addEOS); + for (auto&& token : lineTokens) { + words.push_back((*this)[token]); + } + } else { + std::vector spmIds; + if(inference || alpha_ == 0) + spm_->Encode(line, &spmIds); + else + spm_->SampleEncode(line, -1, alpha_, &spmIds); + + words.reserve(spmIds.size() + addEOS); + for (auto&& spmId : spmIds) + words.push_back(Word::fromWordIndex(spmId)); + } if(addEOS) words.push_back(getEosId()); From 56abb91fe8d6c6aece2c34b7958576b485e8d495 Mon Sep 17 00:00:00 2001 From: Samir Salman Date: Thu, 24 Aug 2023 10:50:41 +0200 Subject: [PATCH 254/254] fix syntax error --- src/common/config_parser.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index cfe8d91c4..d91729b34 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -653,11 +653,9 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { cli.add("--early-stopping", "Stop if the first validation metric does not improve for arg consecutive validation steps", 10); - cli.add("--valid-from", - "Validate model not before arg updates (append 't' for every arg target labels)", - "0u"); - "Stop if the first validation metric does not improve for arg consecutive validation steps", - 10); + cli.add("--valid-from", + "Validate model not before arg updates (append 't' for every arg target labels)", + "0u"); cli.add>("--early-stopping-epsilon", "An improvement lower than or equal to arg does not prevent stalled validation. " "i-th value corresponds to i-th metric in --valid-metrics",