
Commit 8da539e
merged with master
emjotde committed Feb 6, 2022
1 parent 266b931 commit 8da539e
Showing 37 changed files with 849 additions and 340 deletions.
40 changes: 20 additions & 20 deletions azure-pipelines.yml
@@ -6,6 +6,13 @@
# 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file
# 4. "More actions" > "Save"

parameters:
# Allow skipping the entire 'Build' stage
- name: runBuilds
displayName: Run builds? Uncheck to run regression tests only.
type: boolean
default: true

# The pipeline CI trigger is set on the branch master only and PR trigger on a
# (non-draft) pull request to any branch
trigger:
@@ -45,6 +52,7 @@ stages:

######################################################################
- job: BuildWindows
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Windows

strategy:
@@ -180,6 +188,7 @@ stages:
######################################################################
- job: BuildUbuntu
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Ubuntu
timeoutInMinutes: 90

@@ -237,17 +246,7 @@ stages:
examples: true
static: true
################################################################
# Ubuntu 16.04 supports CUDA 8+
"16.04 CUDA 9.2 gcc-7":
image: ubuntu-16.04
boost: true
cpu: true
gpu: true
cuda: 9.2
gcc: 7
unit_tests: true
examples: true
static: false
# Ubuntu 16.04 is no longer available on Azure-hosted machines

pool:
vmImage: $(image)
@@ -322,18 +321,17 @@ stages:
######################################################################
- job: BuildUbuntuMinimal
displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5

pool:
vmImage: ubuntu-16.04
vmImage: ubuntu-18.04

steps:
- checkout: self
submodules: true

# The script simplifies installation of different versions of CUDA.
# Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9.
# Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work...
- bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0"
displayName: Install CUDA

@@ -346,10 +344,10 @@
# GCC 5 is the minimum version supported
- bash: |
/usr/bin/gcc-5 --version
/usr/bin/gcc-7 --version
mkdir -p build
cd build
CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \
CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \
../cmake-3.5.1-Linux-x86_64/bin/cmake .. \
-DCOMPILE_CPU=on \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0
@@ -368,10 +366,11 @@
######################################################################
- job: BuildMacOS
condition: eq(${{ parameters.runBuilds }}, true)
displayName: macOS CPU clang

pool:
vmImage: macos-latest
vmImage: macos-10.15

steps:
- checkout: self
@@ -416,6 +415,7 @@ stages:
######################################################################
- job: BuildInstall
condition: eq(${{ parameters.runBuilds }}, true)
displayName: Linux CPU library install

pool:
@@ -580,7 +580,7 @@ stages:
# Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\'
# instead of '/', which often breaks the job
- bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
- bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops'
continueOnError: true
displayName: Run tests
workingDirectory: marian-prod-tests
@@ -677,7 +677,7 @@ stages:
AWS_SECRET_SAS_TOKEN: $(blob-sas-token)
workingDirectory: marian-prod-tests
- bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
- bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops'
continueOnError: true
displayName: Run tests
workingDirectory: marian-prod-tests
4 changes: 2 additions & 2 deletions src/common/aliases.cpp
@@ -31,8 +31,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
cli.alias("fp16", "true", [&](YAML::Node& config) {
if(mode_ == cli::mode::training) {
config["precision"] = std::vector<std::string>({"float16", "float32"}); // inference type, optimization type, save type
// scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor
config["cost-scaling"] = std::vector<std::string>({"0", "1000", "2", "0.05", "10", "1e-5"});
// scaling factor, frequency, multiplier at increase, minimum scaling factor
config["cost-scaling"] = std::vector<std::string>({"256.f", "1000", "2.f", "256.f"});
} else {
config["precision"] = std::vector<std::string>({"float16"}); // for inference we do not need the other types
}
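Editor's note: the new defaults (256.f, 1000, 2.f, 256.f) replace the old power-of-2 search with a fixed starting scale. As a rough illustration of how a dynamic cost scaler with these four parameters behaves, here is a minimal self-contained sketch; the struct and its names are hypothetical, not Marian's actual optimizer code:

```cpp
#include <algorithm>

// Hypothetical sketch of dynamic cost scaling with the new defaults:
// start at 256, double after 1000 overflow-free updates, halve on overflow,
// and never drop below the minimum factor of 256.
struct CostScaler {
  float scale      = 256.f; // initial scaling factor
  float multiplier = 2.f;   // factor applied at increase/decrease
  float minScale   = 256.f; // scale never drops below this
  int   frequency  = 1000;  // overflow-free updates before increasing
  int   goodSteps  = 0;

  void update(bool overflowed) {
    if(overflowed) { // float16 gradients overflowed: back off
      scale = std::max(scale / multiplier, minScale);
      goodSteps = 0;
    } else if(++goodSteps >= frequency) { // stable: try a larger scale
      scale *= multiplier;
      goodSteps = 0;
    }
  }
};
```

The loss is multiplied by `scale` before the backward pass and gradients are divided by it afterwards, which keeps small float16 gradients from flushing to zero.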
28 changes: 21 additions & 7 deletions src/common/config_parser.cpp
@@ -267,10 +267,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
"Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
cli.add<int>("--transformer-dim-ffn",
"Size of position-wise feed-forward network (transformer)",
2048);
2048);
cli.add<int>("--transformer-decoder-dim-ffn",
"Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.",
0);
cli.add<int>("--transformer-ffn-depth",
"Depth of filters (transformer)",
2);
cli.add<int>("--transformer-decoder-ffn-depth",
"Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0",
0);
cli.add<std::string>("--transformer-ffn-activation",
"Activation between filters: swish or relu (transformer)",
"swish");
@@ -528,15 +534,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
// mixed precision training
cli.add<bool>("--fp16",
"Shortcut for mixed precision training with float16 and cost-scaling, "
"corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f");
"corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision training for forward/backward pass and optimizaton. "
"Defines types for: forward/backward pass, optimization.",
{"float32", "float32"});
cli.add<std::vector<std::string>>("--cost-scaling",
"Dynamic cost scaling for mixed precision training: "
"power of 2, scaling window, scaling factor, tolerance, range, minimum factor")
->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f");
"scaling factor, frequency, multiplier, minimum factor")
->implicit_val("256.f 1000 2.f 256.f");
cli.add<size_t>("--gradient-norm-average-window",
"Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
"After this many updates about 90% of the mass of the exponential average comes from these updates",
@@ -702,9 +708,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Use softmax shortlist: path first best prune");
cli.add<std::vector<float>>("--weights",
"Scorer weights");
cli.add<bool>("--output-sampling",
"Noise output layer with gumbel noise",
false);
cli.add<std::vector<std::string>>("--output-sampling",
"Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. "
" Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.")
->implicit_val("full");
cli.add<std::vector<int>>("--output-approx-knn",
"Use approximate knn search in output layer (currently only in transformer)")
->implicit_val("100 1024");
@@ -889,6 +896,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
if(mode_ == cli::mode::training) {
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");

cli.add<size_t>("--data-threads",
"Number of concurrent threads to use during data reading and processing", 1);

// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
cli.add<size_t>("--all-caps-every",
"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
@@ -907,6 +918,9 @@
cli.add<bool>("--mini-batch-round-up",
"Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
true);
} else {
cli.add<size_t>("--data-threads",
"Number of concurrent threads to use during data reading and processing", 1);
}
// clang-format on
}
10 changes: 5 additions & 5 deletions src/common/definitions.h
@@ -106,24 +106,24 @@ using Weak = std::weak_ptr<T>;
/** @brief Creates shared_ptr of any type, passes all arguments to any available
* constructor */
template <class T, typename... Args>
Ptr<T> New(Args&&... args) {
return Ptr<T>(new T(std::forward<Args>(args)...));
inline Ptr<T> New(Args&&... args) {
return std::make_shared<T>(std::forward<Args>(args)...);
}

template <class T>
Ptr<T> New(Ptr<T> p) {
inline Ptr<T> New(Ptr<T> p) {
return Ptr<T>(p);
}

/** @brief Creates IntrusivePtr of any type, passes all arguments to any available
* constructor */
template <class T, typename... Args>
IPtr<T> INew(Args&&... args) {
inline IPtr<T> INew(Args&&... args) {
return IPtr<T>(new T(std::forward<Args>(args)...));
}

template <class T>
IPtr<T> INew(Ptr<T> p) {
inline IPtr<T> INew(Ptr<T> p) {
return IPtr<T>(p);
}

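Editor's note: the change to New<T> swaps Ptr<T>(new T(...)) for std::make_shared, which allocates the object and the shared_ptr control block together. A small standalone example of the difference (general C++, not Marian-specific code):

```cpp
#include <memory>

// shared_ptr<T>(new T) performs two allocations: one for the object and
// one for the reference-count control block. make_shared fuses them into
// a single allocation with better cache locality.
struct Foo {
  int x;
  explicit Foo(int x) : x(x) {}
};

int main() {
  auto a = std::shared_ptr<Foo>(new Foo(1)); // two allocations
  auto b = std::make_shared<Foo>(2);         // one allocation
  return a->x + b->x;
}
```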
8 changes: 3 additions & 5 deletions src/common/utils.cpp
@@ -70,22 +70,20 @@ void split(const std::string& line,
// the function guarantees that the output has as many elements as requested
void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields) {
fields.clear();
fields.resize(numFields); // make sure there are as many elements as requested

size_t begin = 0;
size_t pos = 0;
for(size_t i = 0; i < numFields; ++i) {
pos = line.find('\t', begin);
if(pos == std::string::npos) {
fields.push_back(line.substr(begin));
fields[i] = line.substr(begin);
break;
}
fields.push_back(line.substr(begin, pos - begin));
fields[i] = line.substr(begin, pos - begin);
begin = pos + 1;
}

if(fields.size() < numFields) // make sure there is as many elements as requested
fields.resize(numFields);

ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line);
}

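Editor's note: the rewritten splitTsv resizes the output vector up front and assigns by index, so callers always receive exactly numFields entries, with missing trailing fields left empty. A usage sketch, assuming splitTsv is declared in common/utils.h under namespace marian::utils:

```cpp
#include <string>
#include <vector>
#include "common/utils.h" // assumed location of splitTsv's declaration

void example() {
  std::vector<std::string> fields;
  // Ask for 3 fields from a line that only contains 2:
  marian::utils::splitTsv("source text\ttarget text", fields, 3);
  // fields.size() == 3; fields[2] == "" instead of being absent,
  // which is what the up-front resize in the new version guarantees.
}
```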
35 changes: 23 additions & 12 deletions src/data/batch_generator.h
@@ -2,6 +2,7 @@

#include "common/options.h"
#include "common/signal_handling.h"
#include "common/timer.h"
#include "data/batch_stats.h"
#include "data/rng_engine.h"
#include "training/training_state.h"
@@ -92,6 +93,8 @@ class BatchGenerator : public RNGEngine {

// this runs on a bg thread; sequencing is handled by caller, but locking is done in here
std::deque<BatchPtr> fetchBatches() {
timer::Timer total;

typedef typename Sample::value_type Item;
auto itemCmp = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; // sort by element length, not content

@@ -135,19 +138,29 @@
if(current_ != data_->end())
++current_;
}
size_t sets = 0;
while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data

Samples maxiBatchTemp;
while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
maxiBatch->push(*current_);
sets = current_->size();

maxiBatchTemp.push_back(*current_);

// do not consume more than required for the maxi batch, as otherwise
// line-by-line translation would be delayed by one sentence
bool last = maxiBatch->size() == maxSize;
bool last = maxiBatchTemp.size() == maxSize;
if(!last)
++current_; // this actually reads the next line and pre-processes it
}
size_t numSentencesRead = maxiBatch->size();
size_t numSentencesRead = maxiBatchTemp.size();

size_t sets = 0;
for(auto&& s : maxiBatchTemp) {
if(!s.empty()) {
sets = s.size();
maxiBatch->push(s);
}
}

// construct the actual batches and place them in the queue
Samples batchVector;
@@ -163,6 +176,7 @@
BatchStats::const_iterator cachedStatsIter;
if (stats_)
cachedStatsIter = stats_->begin();

while(!maxiBatch->empty()) { // while there are sentences in the queue
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
@@ -178,12 +192,7 @@
lengths[i] = batchVector.back()[i].size(); // record max lengths so far

maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter);
// this optimization makes no difference indeed
#if 0 // sanity check: would we find the same entry if searching from the start?
auto it = stats_->lower_bound(lengths);
auto maxBatchSize1 = stats_->findBatchSize(lengths, it);
ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked");
#endif

makeBatch = batchVector.size() >= maxBatchSize;
// if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch
if(batchVector.size() > maxBatchSize) {
@@ -231,6 +240,8 @@
LOG(debug, "[data] fetched {} batches with {} sentences. Per batch: {} sentences, {} labels.",
tempBatches.size(), numSentencesRead,
(double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom);
LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed());

return tempBatches;
}

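Editor's note: the new timing code wraps the whole fetch in a timer::Timer and logs seconds and sentences per second. A minimal stand-in showing the pattern (Marian's own class lives in common/timer.h; this sketch only mirrors how it is used here):

```cpp
#include <chrono>
#include <cstdio>

// Simplified stand-in for timer::Timer: construction starts the clock,
// elapsed() returns seconds since construction.
struct Timer {
  std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
  double elapsed() const {
    return std::chrono::duration<double>(
        std::chrono::steady_clock::now() - start).count();
  }
};

int main() {
  Timer total;
  // ... read, sort, and assemble batches ...
  std::printf("fetching batches took %.2f seconds\n", total.elapsed());
}
```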