[Global]
; Size of the reserved memory in MiB to be used for the forward calculation.
forward_memory_mb=512
; Size of the reserved memory in MiB to be used for the backward calculation.
; Usually this value can be set to the same value as `forward_memory_mb`.
backward_memory_mb=512
; Size of the reserved memory in MiB to be used for the network parameters.
parameter_memory_mb=1024
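; For illustration: the three pools above are reserved separately, so these
; defaults reserve roughly 512 + 512 + 1024 = 2048 MiB of device memory in
; total.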
; Seed value for the internal randomizer of NMTKit.
; If this value is 0, NMTKit chooses the actual seed at random.
random_seed=0
; Seed value for the neural network backend.
; If this value is 0, NMTKit chooses the actual seed at random.
backend_random_seed=0
; File format to save models. Available options:
; * binary ... boost::binary_[io]archive
; * text ..... boost::text_[io]archive
archive_format=binary
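; Note: binary archives are compact and fast to load but are generally not
; portable across platforms/compilers, while text archives are larger but
; ASCII-based and easier to inspect.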
[Corpus]
; The following parameters can be specified as either absolute or relative
; paths. Relative paths are resolved against the current working directory.
; Location of the parallel corpus used for training.
train_source=submodules/small_parallel_enja/train.en
train_target=submodules/small_parallel_enja/train.ja
; Location of the parallel corpus used for validation.
dev_source=submodules/small_parallel_enja/dev.en
dev_target=submodules/small_parallel_enja/dev.ja
; Location of the parallel corpus used for testing.
test_source=submodules/small_parallel_enja/test.en
test_target=submodules/small_parallel_enja/test.ja
[Model]
; Vocabulary type on each side. Available options:
; * word ........ UTF-8 whitespace-separated words.
; * bpe ......... Byte Pair Encoding based on Sennrich et al. 2015.
; * character ... UTF-8 letters including whitespaces.
source_vocabulary_type=word
target_vocabulary_type=word
; Vocabulary size on each side.
source_vocabulary_size=4100
target_vocabulary_size=4900
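; For illustration only (hypothetical splits; actual BPE segmentation depends
; on the merges learned from the corpus, and the "@@" continuation marker is
; just one common convention):
;   word ........ "the cat sat" -> [the] [cat] [sat]
;   bpe ......... "unhappiness" -> [un@@] [happiness]
;   character ... "a cat"       -> [a] [ ] [c] [a] [t]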
; Name of the encoder strategy. Available options:
; * bidirectional ... Bidirectional RNN.
; * forward ......... Forward RNN.
; * backward ........ Backward RNN.
encoder_type=bidirectional
; Name of the decoder strategy. Available options:
; * default .... Default RNN decoder (similar to Stanford IWSLT15 model).
; * bahdanau ... Bahdanau+2014 style decoder.
; * luong ...... Luong+2015 style decoder.
decoder_type=luong
; Depth of encoder/decoder stacks.
num_layers=1
; Number of units in each embedding layer.
source_embedding_size=512
target_embedding_size=512
output_embedding_size=512
; Number of units in each RNN hidden layer.
; These values are not necessarily equal to the actual hidden layer sizes;
; the actual sizes are determined by both these values and the
; encoder/decoder implementation.
encoder_hidden_size=512
decoder_hidden_size=512
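; For example, a bidirectional encoder typically concatenates its forward and
; backward states, so with encoder_hidden_size=512 the states exposed to the
; attention/decoder may be 1024-dimensional (implementation-dependent).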
; Name of the attention strategy. Available options:
; * mlp ........ Multilayer perceptron-based model.
; (proposed in [Bahdanau+14])
; * bilinear ... Bilinear-based model.
; (proposed as the "general" method in [Luong+15])
attention_type=mlp
; Number of units in the attention hidden layer.
; Currently, this value is used only by the "mlp" method.
attention_hidden_size=512
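; As a sketch, the two scoring functions are usually formulated as follows,
; with decoder state s and encoder state h (NMTKit's exact parameterization
; may differ):
;   mlp ........ score(s, h) = v^T tanh(W1*s + W2*h), where W1 and W2 project
;                into an attention_hidden_size-dimensional space.
;   bilinear ... score(s, h) = s^T W h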
; Name of the prediction strategy. Available options:
; * softmax ... Softmax prediction.
predictor_type=softmax
[Batch]
; Name of the method to make batch data. Available options:
; * sentence ...... Make batch data according to the number of sentences.
; * both_word ..... Make batch data according to the number of both source and
; target words.
; * source_word ... Make batch data according to the number of source words.
; * target_word ... Make batch data according to the number of target words.
batch_method=target_word
; Name of the method used to sort the training corpus. Available options:
; * none ............ Never sort the corpus.
; * source .......... Sort by source lengths.
; * target .......... Sort by target lengths.
; * source_target ... First sort by source lengths, then sort by target
;                     lengths while maintaining the order of source lengths.
; * target_source ... First sort by target lengths, then sort by source
;                     lengths while maintaining the order of target lengths.
sort_method=target_source
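; Example (target_source): length pairs (source, target) {(5,7), (3,7), (4,6)}
; are reordered as (4,6), (3,7), (5,7): target length is the primary sort key
; and source length the secondary key.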
; Maximum size of a batch.
; Whether this counts sentences or words is determined by the `batch_method`
; option.
batch_size=1024
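; Example: with batch_method=target_word and batch_size=1024, sentence pairs
; are grouped into one batch until their total target word count reaches about
; 1024; with batch_method=sentence, each batch would instead hold up to 1024
; sentence pairs.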
; Maximum number of words in the source/target sentences.
max_length=20
; Maximum ratio of the lengths between source/target sentences.
max_length_ratio=3.0
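; Example: with max_length_ratio=3.0, a pair with a 16-word source and a
; 4-word target (ratio 16/4 = 4.0) is excluded from training, while a pair
; with 12 and 5 words (ratio 2.4) is kept.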
[Train]
; Name of the optimizer. Available options:
; * sgd ........ Simple stochastic gradient descent.
; * momentum ... SGD with momentum.
; * adagrad .... AdaGrad optimizer.
; * adadelta ... AdaDelta optimizer.
; * adam ....... Adam optimizer.
optimizer_type=adam
; Hyperparameters for SGD and SGD with momentum.
sgd_eta=0.1
sgd_momentum=0.9
; Hyperparameters for AdaGrad.
adagrad_eta=0.1
adagrad_eps=1e-20
; Hyperparameters for AdaDelta.
adadelta_eps=1e-6
adadelta_rho=0.95
; Hyperparameters for Adam.
adam_alpha=0.001
adam_beta1=0.9
adam_beta2=0.999
adam_eps=1e-8
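; For reference, these correspond to the standard Adam update
; (Kingma & Ba, 2014):
;   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
;   v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
;   theta_t = theta_{t-1} - adam_alpha * m_hat_t / (sqrt(v_hat_t) + adam_eps)
; where m_hat_t and v_hat_t are the bias-corrected moment estimates.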
; Loss integration strategy. Available options:
; * sum .... Sum all loss values in a batch.
; * mean ... Sum all loss values in a batch, then divide the result by the
; number of sentences in the batch.
loss_integration_type=sum
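; Example: for a 4-sentence batch with per-sentence losses {2.0, 3.0, 1.0,
; 2.0}, sum yields 8.0 and mean yields 8.0 / 4 = 2.0; mean keeps the loss
; scale roughly independent of how many sentences land in each batch.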
; Clip the gradient when its L2 norm exceeds this threshold.
; gradient_clipping must not be negative; set it to 0.0 to disable clipping.
gradient_clipping=5.0
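; That is, whenever ||g||_2 > 5.0, the gradient is rescaled as
; g <- g * 5.0 / ||g||_2, capping its magnitude while keeping its direction.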
; Learning rate decay policy. Available options:
; * none ..... Never updates learning rate.
; * batch .... Updates learning rate after each batch calculation.
; * eval ..... Updates learning rate after each evaluation.
; * logppl ... Updates learning rate when the log perplexity becomes worse.
; * bleu ..... Updates learning rate when the BLEU score becomes worse.
lr_decay_type=logppl
; Learning rate decay factor. The current learning rate is multiplied by this
; value at each decay operation.
lr_decay_ratio=0.5
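; Example: starting from adam_alpha=0.001, the effective learning rate after
; two decay operations is 0.001 * 0.5 * 0.5 = 0.00025.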
; Dropout probability for vertical connections of the RNN layers.
dropout_ratio=0.3
; Maximum number of batches to train (i.e., the number of training iterations).
max_iteration=10000
; Unit used to schedule dev/test evaluation. Available options:
; * step ..... Number of steps (iterations).
; * sample ... Number of samples (sentences).
; * corpus ... Number of epochs (corpus).
; * word ..... Number of processed target words.
; * time ..... Processing time (minutes).
evaluation_type=step
; Evaluation (validation/testing) is performed after every
; `evaluation_interval` steps/samples/epochs/words/minutes, as selected by
; `evaluation_type`.
evaluation_interval=100
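; Example: with evaluation_type=step and evaluation_interval=100, the dev/test
; sets are evaluated after every 100 training batches; with
; evaluation_type=corpus and evaluation_interval=1, they would be evaluated
; once per epoch.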