Merge pull request #357 from ufal/examples_ordnung

Examples ordnung
ufal · Mar 11, 2017 · 3468c45 · 3468c45
2 parents 87aea97 + 80bf74d
commit 3468c45
Show file tree

Hide file tree

Showing 20 changed files with 216 additions and 261 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,4 @@ tests/*.en
 tests/*.de
 .idea
 tmp-*
+out-example-*
diff --git a/examples/README.md b/examples/README.md
@@ -4,8 +4,17 @@ This directory contains example configuration files for toy experiments with
 Neuralmonkey.
 
 These experiments use example data, which can be downloaded by the download
-script inside the data subdirectory. To prepare the data, just run the script
-from inside the `data` directory.
+script inside the example's dedicated data subdirectory. To prepare the data
+for e.g. the tagging example, just run the script from inside the
+`data/tagging` directory.
 
 - `translation.ini` an example translation model. For running this model on
   GPU, it should have at least 6 GB RAM.
+
+- `tagging.ini` an example part-of-speech tagger model.
+
+- `language_model.ini` an example INI for training a language model.
+
+There are also additional examples in the `_old` directory but they have not
+been updated for a while and therefore won't be functional without further
+edits.
diff --git a/examples/factored.ini → examples/_old/factored.ini b/examples/factored.ini → examples/_old/factored.ini
diff --git a/examples/multiobjective.ini → examples/_old/multiobjective.ini b/examples/multiobjective.ini → examples/_old/multiobjective.ini
diff --git a/examples/_old/multiobjective_readme.txt b/examples/_old/multiobjective_readme.txt
@@ -0,0 +1,18 @@
+
+How to get PCEDT 2.0 data for example tagger
+============================================
+
+For the example tagger, we use the Prague Czech-English Dependency Treebank
+(https://ufal.mff.cuni.cz/pcedt2.0).
+
+Follow the instructions on how to download the data on the PCEDT 2.0 web pages.
+
+For a successful run of the example, you should end up with these files in this
+directory:
+
+* `train.forms-cs`
+* `train.tags-cs`
+* `train.tags-cs.subpos`
+* `val.forms-cs`
+* `val.tags-cs.subpos`
+* `val.tags-cs`
diff --git a/examples/align-and-translate/Makefile b/examples/align-and-translate/Makefile
diff --git a/examples/data/download_example_data.sh b/examples/data/download_example_data.sh
diff --git a/examples/data/language_model/.gitignore b/examples/data/language_model/.gitignore
@@ -0,0 +1,4 @@
+LICENSE
+train
+test
+val
diff --git a/examples/data/language_model/download_example_data.sh b/examples/data/language_model/download_example_data.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+for file in train val test LICENSE; do
+    wget http://ufallab.ms.mff.cuni.cz/~helcl/neuralmonkey-example-data/language_model/$file
+done
diff --git a/examples/data/tagging/.gitignore b/examples/data/tagging/.gitignore
@@ -0,0 +1,5 @@
+LICENSE
+train.forms-cs
+train.tags-cs.subpos
+val.forms-cs
+val.tags-cs.subpos
diff --git a/examples/data/tagging/download_example_data.sh b/examples/data/tagging/download_example_data.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+for file in train.forms-cs train.tags-cs.subpos val.forms-cs val.tags-cs.subpos LICENSE; do
+    wget http://ufallab.ms.mff.cuni.cz/~helcl/neuralmonkey-example-data/tagging/$file
+done
diff --git a/examples/data/translation/.gitignore b/examples/data/translation/.gitignore
@@ -0,0 +1,5 @@
+bpe_merges
+train.de
+train.en
+val.de
+val.en
diff --git a/examples/data/translation/download_example_data.sh b/examples/data/translation/download_example_data.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+for file in bpe_merges train.en train.de val.en val.de; do
+    wget http://ufallab.ms.mff.cuni.cz/~helcl/neuralmonkey-example-data/translation/$file
+done
diff --git a/examples/language_model.ini b/examples/language_model.ini
@@ -1,90 +1,61 @@
-; This is an example configuration for training a language model.  It is an
-; INI file with few added syntanctic restrictions.
-;
-; Names in square brackets refer to objects in the program. With the exception
-; of the [main] block, all of them will be instantiated as objects.
-;
-; The field values can be of several types:
-;
-;   * None - interpreted as Python None
-;   * True / False - interpreted as boolean values
-;   * integers
-;   * floating point numbers
-;   * python types (fully defined with module name)
-;   * references to other objects in the configuration, closed in <>
-;   * strings (if it does not match any other pattern)
-;   * tuples of the previous enclosed in brackets
-;   * list of the previous, enclosed in square brackets, comma-separated
-;
-; The vocabularies are handled in a special way. If the vocabularies source is
-; defined in the [main] (a dataset object) a dictionary that maps the language
-; code to the vocabularies is created. Later, if any other block has a field
-; called 'vocabulary', and its value is a known language code, the vocabulary
-; from the dictionary is used. Vocabularies can be also defined as objects
-; in the INI file and can be referenced using the <> notation.
-;
+; This is an example configuration for training a language model. For a more detailed
+; description of an INI example, please refer to the translation.ini file.
 
 [main]
-; The main block contains the mandatory fields for running and experiment.
-output=experiments/example-lm-$TIME
-encoders=[]
-decoder=<decoder>
-runner=<runner>
-evaluation=[<perplexity>]
-threads=4
-; The following options are used exclusively for training
-name=language model
-batch_size=5
-epochs=10
+name="language modeling"
+output="out-example-langmodel"
+tf_manager=<tf_manager>
+
 train_dataset=<train_data>
 val_dataset=<val_data>
+test_datasets=[<val_data>]
+
+runners=[<runner>]
 trainer=<trainer>
-minimize=True
-validation_period=100
+evaluation=[]
+
+batch_size=50
+epochs=50
+
+validation_period=500
 logging_period=20
 
-[perplexity]
-class=evaluators.perplexity.Perplexity
+[tf_manager]
+class=tf_manager.TensorFlowManager
+num_sessions=1
+num_threads=4
 
 [train_data]
-; This is definition of the training data object. Notice that language are
-; defined here, because they are used identifiers while preparing vocabularies.
-; Dataset is not a standard class, it treats the __init__ methods arguements as
-; a dictionary, therefore the data series names can be any strings.
 class=dataset.load_dataset_from_files
-s_target=examples/data/train.de
+s_words="examples/data/language_model/train"
 
 [val_data]
-; Validation data, the languages are not necessary here, encoders and decoder
-; acces the data series via the string identifiers defined here.
 class=dataset.load_dataset_from_files
-s_target=examples/data/val.de
+s_words="examples/data/language_model/val"
 
-[decoder_vocabulary]
+[vocabulary]
 class=vocabulary.from_dataset
 datasets=[<train_data>]
-series_ids=[target]
+series_ids=["words"]
 max_size=25000
 
 [decoder]
 class=decoders.decoder.Decoder
-name=decoder
+name="decoder"
 encoders=[]
-rnn_size=256
-embedding_size=256
-use_attention=True
-dropout_keep_prob=0.5
-data_id=target
-vocabulary=<decoder_vocabulary>
+rnn_size=300
+embedding_size=300
+data_id="words"
+vocabulary=<vocabulary>
 max_output_len=50
 
 [trainer]
-; This block just fills the arguments of the trainer __init__ method.
 class=trainers.cross_entropy_trainer.CrossEntropyTrainer
-decoder=<decoder>
-l2_regularization=1.0e-8
+decoders=[<decoder>]
+l2_weight=1.0e-8
+clip_norm=1.0
 
 [runner]
-class=runners.perplexity_runner.PerplexityRunner
+class=runners.runner.GreedyRunner
 decoder=<decoder>
-batch_size=256
+output_series="words"
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,3 +20,4 @@ tests/*.en @@
     tests/*.de
     .idea
     tmp-*
+    out-example-*