Merge pull request #777 from ufal/tf-data-1
Towards TF dataset, part I
jindrahelcl authored Dec 12, 2018
2 parents b57d4d3 + 4f0f44f commit 8515d6c
Showing 34 changed files with 1,308 additions and 1,270 deletions.
12 changes: 7 additions & 5 deletions neuralmonkey/attention/combination.py
@@ -23,6 +23,7 @@
get_attention_states, get_attention_mask, Attendable)
from neuralmonkey.attention.namedtuples import HierarchicalLoopState
from neuralmonkey.checking import assert_shape
from neuralmonkey.decorators import tensor
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.tf_utils import get_variable
@@ -49,11 +50,6 @@ def __init__(self,
self._use_sentinels = use_sentinels

self.att_scope_name = "attention_{}".format(name)

with self.use_scope():
self.attn_v = get_variable(
"attn_v", [1, 1, self.attention_state_size],
initializer=tf.random_normal_initializer(stddev=0.001))
# pylint: enable=unused-argument,too-many-arguments

def attention(self,
@@ -64,6 +60,12 @@ def attention(self,
"""Get context vector for given decoder state."""
raise NotImplementedError("Abstract method")

@tensor
def attn_v(self) -> tf.Tensor:
return get_variable(
"attn_v", [1, 1, self.attention_state_size],
initializer=tf.random_normal_initializer(stddev=0.001))

@property
def attn_size(self):
return self.attention_state_size
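The hunk above is representative of a pattern applied throughout this commit: the attn_v variable is no longer created eagerly in the constructor but exposed as a lazily built, cached tensor via the @tensor decorator imported from neuralmonkey.decorators. A minimal sketch of how such a caching decorator can work (illustrative only; the actual decorator in the repository may handle variable scoping and naming differently):

import functools


def tensor(func):
    """Turn a no-argument method into a cached, lazily evaluated property.

    The wrapped method runs at most once per instance; its result is
    stored on the instance and returned on every subsequent access.
    """
    attr_name = "_lazy_" + func.__name__

    @property
    @functools.wraps(func)
    def wrapper(self):
        if not hasattr(self, attr_name):
            setattr(self, attr_name, func(self))  # build on first access
        return getattr(self, attr_name)

    return wrapper

With something like this in place, the first access to self.attn_v creates the variable and later accesses reuse the same tensor; this sketch does not enter the model part's variable scope, which the real decorator presumably takes care of.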
19 changes: 14 additions & 5 deletions neuralmonkey/attention/feed_forward.py
@@ -13,7 +13,7 @@
BaseAttention, AttentionLoopState, empty_attention_loop_state,
get_attention_states, get_attention_mask, Attendable)
from neuralmonkey.decorators import tensor
from neuralmonkey.logging import log
from neuralmonkey.logging import debug
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.nn.utils import dropout
@@ -42,10 +42,6 @@ def __init__(self,

self._variable_scope.set_initializer(
tf.random_normal_initializer(stddev=0.001))

# TODO blessing
log("Hidden features: {}".format(self.hidden_features))
log("Attention mask: {}".format(self.attention_mask))
# pylint: enable=too-many-arguments

@tensor
@@ -170,6 +166,19 @@ def attention(self,
return context, next_loop_state

def initial_loop_state(self) -> AttentionLoopState:

# Here we need to make sure that the hidden_features and attention_mask
# are pre-computed. If this is used in combination with a decoder which
# has train and runtime while loops, these tensors need to be created
# outside of any of those loops in order to be available to both.

# Note that we are not breaking lazy loading here because this method
# is called from a lazy tensor.

debug("Pre-computing attention tensors", "bless")
debug("Hidden features: {}".format(self.hidden_features), "bless")
debug("Hidden mask: {}".format(self.attention_mask), "bless")

return empty_attention_loop_state(
self.batch_size,
tf.shape(self.attention_states)[1],
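The new comments in initial_loop_state above describe the interaction between lazy tensors and tf.while_loop: a tensor first created inside one loop body is not available to the other loop or to the surrounding graph, so the cached properties are deliberately touched before either loop is built. A toy illustration of that idea (TF 1.x-style graph mode; the class below is a stand-in, not an actual NeuralMonkey model part):

import tensorflow as tf


class TinyAttention:
    """Stand-in for a model part with one lazily built, cached tensor."""

    def __init__(self, states):
        self._states = states
        self._hidden = None

    @property
    def hidden_features(self):
        # Built on first access and cached, mirroring the @tensor decorator.
        if self._hidden is None:
            self._hidden = tf.tanh(self._states)
        return self._hidden


att = TinyAttention(tf.random_normal([2, 5, 16]))

# Touch the property once *outside* of any while loop, so that both the
# train-time and runtime loops can later read the same cached tensor.
_ = att.hidden_features


def loop_body(i):
    # Inside a loop body we only *read* the pre-built tensor; creating it
    # here instead would tie it to this loop's frame.
    score = tf.reduce_sum(att.hidden_features)
    return i + 1 + tf.cast(0 * score, tf.int32)


final_step = tf.while_loop(lambda i: i < 3, loop_body, [tf.constant(0)])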
179 changes: 179 additions & 0 deletions neuralmonkey/config/normalize.py
@@ -0,0 +1,179 @@
"""Module for configuration normalization.
The `[main]` configuration section contains arguments that can be filled with
different types of values, e.g. `trainer` can be either a single trainer
object or a list of them. This module provides functions for unifying the
configuration interface.
"""

from argparse import Namespace
from datetime import timedelta
import re
import time
from typing import List, Union, Callable

import numpy as np

from neuralmonkey.dataset import BatchingScheme
from neuralmonkey.logging import warn
from neuralmonkey.tf_manager import get_default_tf_manager
from neuralmonkey.trainers.delayed_update_trainer import DelayedUpdateTrainer


def normalize_configuration(cfg: Namespace, train_mode: bool) -> None:
"""Given a configuration namespace, normalize the values it contains.
Arguments:
cfg: The namespace object returned by `Configuration.make_namespace`
train_mode: Boolean flag controlling normalization of parameters only
used during training.
"""
if train_mode:
_normalize_train_cfg(cfg)

if cfg.tf_manager is None:
cfg.tf_manager = get_default_tf_manager()

if (cfg.batch_size is None) == (cfg.batching_scheme is None):
raise ValueError("You must specify either batch_size or "
"batching_scheme (not both).")

if cfg.batch_size is not None:
assert cfg.batching_scheme is None
cfg.batching_scheme = BatchingScheme(batch_size=cfg.batch_size)
else:
assert cfg.batching_scheme is not None
cfg.batch_size = cfg.batching_scheme.batch_size

if cfg.runners_batch_size is None:
cfg.runners_batch_size = cfg.batching_scheme.batch_size

cfg.runners_batching_scheme = BatchingScheme(
batch_size=cfg.runners_batch_size,
token_level_batching=cfg.batching_scheme.token_level_batching,
use_leftover_buckets=True)

cfg.evaluation = [(e[0], e[0], e[1]) if len(e) == 2 else e
for e in cfg.evaluation]

if cfg.evaluation:
cfg.main_metric = "{}/{}".format(cfg.evaluation[-1][0],
cfg.evaluation[-1][-1].name)
else:
cfg.main_metric = "{}/{}".format(cfg.runners[-1].decoder_data_id,
cfg.runners[-1].loss_names[0])

if not cfg.tf_manager.minimize_metric:
raise ValueError("minimize_metric must be set to True in "
"TensorFlowManager when using loss as "
"the main metric")


def _normalize_train_cfg(cfg: Namespace) -> None:
"""Given a configuration namespace, normalize the values it contains.
This function is only executed when training mode has been invoked.
Arguments:
cfg: The namespace object returned by `Configuration.make_namespace`
"""
if not isinstance(cfg.val_dataset, List):
cfg.val_datasets = [cfg.val_dataset]
else:
cfg.val_datasets = cfg.val_dataset

if not isinstance(cfg.trainer, List):
cfg.trainers = [cfg.trainer]
else:
cfg.trainers = cfg.trainer

# deal with delayed trainer and logging periods
# the correct way if there are more trainers is perhaps to do a
# lowest common denominator of their batches_per_update.
# But we can also warn because it is a very weird setup.

delayed_trainers = [t for t in cfg.trainers
if isinstance(t, DelayedUpdateTrainer)]

denominator = 1
if len(cfg.trainers) > 1 and delayed_trainers:
warn("Weird setup: using more trainers and one of them is delayed "
"update trainer. No-one can vouch for your safety, user!")
warn("Using the lowest common denominator of all delayed trainers'"
" batches_per_update parameters for logging period")
warn("Note that if you are using a multi-task trainer, it is on "
"your own risk")

denominator = np.lcm.reduce([t.batches_per_update
for t in delayed_trainers])
elif delayed_trainers:
assert len(cfg.trainers) == 1
denominator = cfg.trainers[0].batches_per_update

cfg.log_timer = _resolve_period(cfg.logging_period, denominator)
cfg.val_timer = _resolve_period(cfg.validation_period, denominator)


def _resolve_period(period: Union[str, int],
denominator: int) -> Callable[[int, float], bool]:
"""Convert logging period into a function for logging time checks.
Logging and validation periods can both be provided either as a number of
batches after which to log/validate, or as a time interval between the
logs/validation runs.
This function unifies both representations into a function that decides
whether to log/validate based on a given training step and time since the
last log/validation.
Arguments:
period: Either a string representing time, or a number representing
number of batches.
denominator: Only allow logging when the given step (number of batches
since the start of the training) is divisible by this value.
This is used e.g. when `DelayedUpdateTrainer` is used.
Returns:
A function of the current training step and time since the last logging
period that returns a boolean value.
"""
def get_batch_logger(period: int) -> Callable[[int, float], bool]:
def is_time(step: int, _: float) -> bool:
return step != 0 and step % period == 0
return is_time

def get_time_logger(period: float) -> Callable[[int, float], bool]:
def is_time(step: int, last_time: float) -> bool:
if step % denominator != 0:
return False
return last_time + period < time.process_time()
return is_time

if isinstance(period, int):
if period % denominator != 0:
raise ValueError(
"When using delayed update trainer, the logging/validation "
"periods must be divisible by batches_per_update.")

return get_batch_logger(period)

regex = re.compile(
r"((?P<days>\d+?)d)?((?P<hours>\d+?)h)?((?P<minutes>\d+?)m)?"
r"((?P<seconds>\d+?)s)?")
parts = regex.match(period)

if not parts:
raise ValueError(
"Validation or logging period have incorrect format. "
"It should be in format: 3h; 5m; 14s")

time_params = {}
for (name, param) in parts.groupdict().items():
if param:
time_params[name] = int(param)

delta_seconds = timedelta(**time_params).total_seconds()
if delta_seconds <= 0:
raise ValueError("Validation or logging period must be bigger than 0")

return get_time_logger(delta_seconds)
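
A brief usage sketch of _resolve_period as defined above (the import path follows the new module location shown in this diff; the printed values for the batch-based timer follow directly from the code):

from neuralmonkey.config.normalize import _resolve_period

# Batch-based period: the period (500) must be divisible by the delayed
# trainer's batches_per_update (here 5); logging then fires on every step
# that is a non-zero multiple of 500.
batch_timer = _resolve_period(500, 5)
print(batch_timer(1000, 0.0))  # True  -> 1000 % 500 == 0
print(batch_timer(750, 0.0))   # False -> 750 is not a multiple of 500

# Time-based period: "3h" is parsed by the regex into a 3-hour timedelta;
# the returned callback compares last_time + 10800 s to time.process_time().
time_timer = _resolve_period("3h", 1)
time_timer(42, 0.0)  # False until three hours of process time have elapsed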
68 changes: 39 additions & 29 deletions neuralmonkey/decoders/autoregressive.py
@@ -15,7 +15,7 @@
from neuralmonkey.model.feedable import FeedDict
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.logging import log, warn
from neuralmonkey.logging import warn
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.nn.utils import dropout
from neuralmonkey.tf_utils import get_variable, get_state_shape_invariants
@@ -134,52 +134,62 @@ def __init__(self,
ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
initializers)

log("Initializing decoder, name: '{}'".format(name))

self.vocabulary = vocabulary
self.data_id = data_id
self.max_output_len = max_output_len
self.dropout_keep_prob = dropout_keep_prob
self.embedding_size = embedding_size
self._embedding_size = embedding_size
self.embeddings_source = embeddings_source
self.label_smoothing = label_smoothing
self.tie_embeddings = tie_embeddings
self.supress_unk = supress_unk

self.encoder_states = [] # type: List[tf.Tensor]
self.encoder_masks = [] # type: List[tf.Tensor]
self.encoder_states = lambda: [] # type: Callable[[], List[tf.Tensor]]
self.encoder_masks = lambda: [] # type: Callable[[], List[tf.Tensor]]

# Check the values of the parameters (max_output_len, ...)
if max_output_len <= 0:
raise ValueError("Maximum sequence length must be "
"a positive integer.")
if self.max_output_len <= 0:
raise ValueError(
"Maximum sequence length must be a positive integer.")

if dropout_keep_prob < 0.0 or dropout_keep_prob > 1.0:
raise ValueError("Dropout keep probability must be"
"a real number in the interval [0,1].")
if self._embedding_size is not None and self._embedding_size <= 0:
raise ValueError("Embedding size must be a positive integer.")

if self.embedding_size is None and self.embeddings_source is None:
raise ValueError("You must specify either embedding size or the "
"embedded sequence from which to reuse the "
"embeddings (e.g. set either 'embedding_size' or "
" 'embeddings_source' parameter)")
if self.dropout_keep_prob < 0.0 or self.dropout_keep_prob > 1.0:
raise ValueError("Dropout keep probability must be a real number "
"in the interval [0,1].")
# pylint: enable=too-many-arguments,too-many-locals

@property
def embedding_size(self) -> int:
if self.embeddings_source is None:
if self._embedding_size is None:
raise ValueError(
"You must specify either embedding size or the embedded "
"sequence from which to reuse the embeddings (e.g. set "
"'embedding_size' or 'embeddings_source' parameter)")
return self._embedding_size

if self.embeddings_source is not None:
if self.embedding_size is not None:
warn("Overriding the embedding_size parameter with the"
" size of the reused embeddings from the encoder.")
if self._embedding_size is not None:
warn("Overriding the embedding_size parameter with the "
"size of the reused embeddings from the encoder.")

self.embedding_size = (
self.embeddings_source.embedding_matrix.get_shape()[1].value)
return self.embeddings_source.embedding_matrix.get_shape()[1].value

with self.use_scope():
self.go_symbols = tf.placeholder(tf.int32, [None], "go_symbols")
# pylint: disable=no-self-use
@tensor
def go_symbols(self) -> tf.Tensor:
return tf.placeholder(tf.int32, [None], "go_symbols")

self.train_inputs = tf.placeholder(
tf.int32, [None, None], "train_inputs")
self.train_mask = tf.placeholder(
tf.float32, [None, None], "train_mask")
# pylint: enable=too-many-arguments,too-many-locals
@tensor
def train_inputs(self) -> tf.Tensor:
return tf.placeholder(tf.int32, [None, None], "train_inputs")

@tensor
def train_mask(self) -> tf.Tensor:
return tf.placeholder(tf.float32, [None, None], "train_mask")
# pylint: enable=no-self-use

@tensor
def decoding_w(self) -> tf.Variable:
6 changes: 4 additions & 2 deletions neuralmonkey/decoders/beam_search_decoder.py
@@ -163,13 +163,15 @@ def __init__(self,
# the beam. We need to access all the inner states of the network in
# the graph, replace them with beam-size-times copied originals, create
# the beam search graph, and then replace the inner states back.
self._building = False

enc_states = self.parent_decoder.encoder_states
enc_masks = self.parent_decoder.encoder_masks

setattr(self.parent_decoder, "encoder_states",
[self.expand_to_beam(states) for states in enc_states])
lambda: [self.expand_to_beam(sts) for sts in enc_states()])
setattr(self.parent_decoder, "encoder_masks",
[self.expand_to_beam(mask) for mask in enc_masks])
lambda: [self.expand_to_beam(mask) for mask in enc_masks()])

# Create the beam search symbolic graph.
with self.use_scope():
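With encoder_states and encoder_masks now being callables, the beam search decoder wraps them in lambdas instead of expanding pre-built tensor lists, so the beam expansion only happens when the parent decoder's graph-building code actually calls them. A schematic sketch of the wrapping pattern (plain-Python stand-ins, not the actual decoder classes):

class ToyParentDecoder:
    def __init__(self):
        # A callable returning encoder tensors instead of a fixed list.
        self.encoder_states = lambda: ["enc_state"]


class ToyBeamSearchDecoder:
    def __init__(self, parent, beam_size):
        self.beam_size = beam_size
        original = parent.encoder_states
        # Wrap the original callable; expansion is deferred until the
        # parent decoder's graph construction actually calls it.
        parent.encoder_states = (
            lambda: [self.expand_to_beam(s) for s in original()])

    def expand_to_beam(self, state):
        # Stand-in for tiling a tensor beam_size times along the batch axis.
        return "{} x{}".format(state, self.beam_size)


parent = ToyParentDecoder()
ToyBeamSearchDecoder(parent, beam_size=4)
print(parent.encoder_states())  # ['enc_state x4'] -- expanded lazily on call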
2 changes: 0 additions & 2 deletions neuralmonkey/decoders/ctc_decoder.py
@@ -6,7 +6,6 @@

from neuralmonkey.dataset import Dataset
from neuralmonkey.decorators import tensor
from neuralmonkey.logging import log
from neuralmonkey.model.feedable import FeedDict
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.model.model_part import ModelPart
@@ -47,7 +46,6 @@ def __init__(self,
self.merge_repeated_targets = merge_repeated_targets
self.merge_repeated_outputs = merge_repeated_outputs
self.beam_width = beam_width
log("CTC output tensor {}.".format(self.decoded))
# pylint: enable=too-many-arguments

# pylint: disable=no-self-use
(Diffs for the remaining changed files are not shown here.)
