Merge pull request #777 from ufal/tf-data-1
Towards TF dataset, part I
jindrahelcl authored Dec 12, 2018
2 parents b57d4d3 + 4f0f44f commit 8515d6c
Showing 34 changed files with 1,308 additions and 1,270 deletions.
12 changes: 7 additions & 5 deletions neuralmonkey/attention/combination.py
@@ -23,6 +23,7 @@
get_attention_states, get_attention_mask, Attendable)
from neuralmonkey.attention.namedtuples import HierarchicalLoopState
from neuralmonkey.checking import assert_shape
from neuralmonkey.decorators import tensor
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.tf_utils import get_variable
@@ -49,11 +50,6 @@ def __init__(self,
self._use_sentinels = use_sentinels

self.att_scope_name = "attention_{}".format(name)

with self.use_scope():
self.attn_v = get_variable(
"attn_v", [1, 1, self.attention_state_size],
initializer=tf.random_normal_initializer(stddev=0.001))
# pylint: enable=unused-argument,too-many-arguments

def attention(self,
@@ -64,6 +60,12 @@ def attention(self,
"""Get context vector for given decoder state."""
raise NotImplementedError("Abstract method")

@tensor
def attn_v(self) -> tf.Tensor:
return get_variable(
"attn_v", [1, 1, self.attention_state_size],
initializer=tf.random_normal_initializer(stddev=0.001))

@property
def attn_size(self):
return self.attention_state_size
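The hunk above is representative of a pattern applied throughout this commit: the attn_v variable is no longer created eagerly in the constructor but exposed as a lazily built, cached tensor via the @tensor decorator imported from neuralmonkey.decorators. A minimal sketch of how such a caching decorator can work (illustrative only; the actual decorator in the repository may handle variable scoping and naming differently):

import functools


def tensor(func):
    """Turn a no-argument method into a cached, lazily evaluated property.

    The wrapped method runs at most once per instance; its result is
    stored on the instance and returned on every subsequent access.
    """
    attr_name = "_lazy_" + func.__name__

    @property
    @functools.wraps(func)
    def wrapper(self):
        if not hasattr(self, attr_name):
            setattr(self, attr_name, func(self))  # build on first access
        return getattr(self, attr_name)

    return wrapper

With something like this in place, the first access to self.attn_v creates the variable and later accesses reuse the same tensor; this sketch does not enter the model part's variable scope, which the real decorator presumably takes care of.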
19 changes: 14 additions & 5 deletions neuralmonkey/attention/feed_forward.py
@@ -13,7 +13,7 @@
BaseAttention, AttentionLoopState, empty_attention_loop_state,
get_attention_states, get_attention_mask, Attendable)
from neuralmonkey.decorators import tensor
from neuralmonkey.logging import log
from neuralmonkey.logging import debug
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.nn.utils import dropout
@@ -42,10 +42,6 @@ def __init__(self,

self._variable_scope.set_initializer(
tf.random_normal_initializer(stddev=0.001))

# TODO blessing
log("Hidden features: {}".format(self.hidden_features))
log("Attention mask: {}".format(self.attention_mask))
# pylint: enable=too-many-arguments

@tensor
@@ -170,6 +166,19 @@ def attention(self,
return context, next_loop_state

def initial_loop_state(self) -> AttentionLoopState:

# Here we need to make sure that the hidden_features and attention_mask
# are pre-computed. If this is used in combination with a decoder which
# has train and runtime while loops, these tensors need to be created
# outside of any of those loops in order to be available to both.

# Note that we are not breaking lazy loading here because this method
# is called from a lazy tensor.

debug("Pre-computing attention tensors", "bless")
debug("Hidden features: {}".format(self.hidden_features), "bless")
debug("Hidden mask: {}".format(self.attention_mask), "bless")

return empty_attention_loop_state(
self.batch_size,
tf.shape(self.attention_states)[1],
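The new comments in initial_loop_state above describe the interaction between lazy tensors and tf.while_loop: a tensor first created inside one loop body is not available to the other loop or to the surrounding graph, so the cached properties are deliberately touched before either loop is built. A toy illustration of that idea (TF 1.x-style graph mode; the class below is a stand-in, not an actual NeuralMonkey model part):

import tensorflow as tf


class TinyAttention:
    """Stand-in for a model part with one lazily built, cached tensor."""

    def __init__(self, states):
        self._states = states
        self._hidden = None

    @property
    def hidden_features(self):
        # Built on first access and cached, mirroring the @tensor decorator.
        if self._hidden is None:
            self._hidden = tf.tanh(self._states)
        return self._hidden


att = TinyAttention(tf.random_normal([2, 5, 16]))

# Touch the property once *outside* of any while loop, so that both the
# train-time and runtime loops can later read the same cached tensor.
_ = att.hidden_features


def loop_body(i):
    # Inside a loop body we only *read* the pre-built tensor; creating it
    # here instead would tie it to this loop's frame.
    score = tf.reduce_sum(att.hidden_features)
    return i + 1 + tf.cast(0 * score, tf.int32)


final_step = tf.while_loop(lambda i: i < 3, loop_body, [tf.constant(0)])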
179 changes: 179 additions & 0 deletions neuralmonkey/config/normalize.py
@@ -0,0 +1,179 @@
"""Module for configuration normalization.
The `[main]` configuration section contains arguments that can be filled with
different types of values, e.g. `trainer` can be either a single trainer
object or a list of them. This module provides functions for unifying the
configuration interface.
"""

from argparse import Namespace
from datetime import timedelta
import re
import time
from typing import List, Union, Callable

import numpy as np

from neuralmonkey.dataset import BatchingScheme
from neuralmonkey.logging import warn
from neuralmonkey.tf_manager import get_default_tf_manager
from neuralmonkey.trainers.delayed_update_trainer import DelayedUpdateTrainer


def normalize_configuration(cfg: Namespace, train_mode: bool) -> None:
"""Given a configuration namespace, normalize the values it contains.
Arguments:
cfg: The namespace object returned by `Configuration.make_namespace`
train_mode: Boolean flag controlling normalization of parameters only
used during training.
"""
if train_mode:
_normalize_train_cfg(cfg)

if cfg.tf_manager is None:
cfg.tf_manager = get_default_tf_manager()

if (cfg.batch_size is None) == (cfg.batching_scheme is None):
raise ValueError("You must specify either batch_size or "
"batching_scheme (not both).")

if cfg.batch_size is not None:
assert cfg.batching_scheme is None
cfg.batching_scheme = BatchingScheme(batch_size=cfg.batch_size)
else:
assert cfg.batching_scheme is not None
cfg.batch_size = cfg.batching_scheme.batch_size

if cfg.runners_batch_size is None:
cfg.runners_batch_size = cfg.batching_scheme.batch_size

cfg.runners_batching_scheme = BatchingScheme(
batch_size=cfg.runners_batch_size,
token_level_batching=cfg.batching_scheme.token_level_batching,
use_leftover_buckets=True)

cfg.evaluation = [(e[0], e[0], e[1]) if len(e) == 2 else e
for e in cfg.evaluation]

if cfg.evaluation:
cfg.main_metric = "{}/{}".format(cfg.evaluation[-1][0],
cfg.evaluation[-1][-1].name)
else:
cfg.main_metric = "{}/{}".format(cfg.runners[-1].decoder_data_id,
cfg.runners[-1].loss_names[0])

if not cfg.tf_manager.minimize_metric:
raise ValueError("minimize_metric must be set to True in "
"TensorFlowManager when using loss as "
"the main metric")


def _normalize_train_cfg(cfg: Namespace) -> None:
"""Given a configuration namespace, normalize the values it contains.
This function is only executed when training mode has been invoked.
Arguments:
cfg: The namespace object returned by `Configuration.make_namespace`
"""
if not isinstance(cfg.val_dataset, List):
cfg.val_datasets = [cfg.val_dataset]
else:
cfg.val_datasets = cfg.val_dataset

if not isinstance(cfg.trainer, List):
cfg.trainers = [cfg.trainer]
else:
cfg.trainers = cfg.trainer

# deal with delayed trainer and logging periods
# the correct way if there are more trainers is perhaps to do a
# lowest common denominator of their batches_per_update.
# But we can also warn because it is a very weird setup.

delayed_trainers = [t for t in cfg.trainers
if isinstance(t, DelayedUpdateTrainer)]

denominator = 1
if len(cfg.trainers) > 1 and delayed_trainers:
warn("Weird setup: using more trainers and one of them is delayed "
"update trainer. No-one can vouch for your safety, user!")
warn("Using the lowest common denominator of all delayed trainers'"
" batches_per_update parameters for logging period")
warn("Note that if you are using a multi-task trainer, it is on "
"your own risk")

denominator = np.lcm.reduce([t.batches_per_update
for t in delayed_trainers])
elif delayed_trainers:
assert len(cfg.trainers) == 1
denominator = cfg.trainers[0].batches_per_update

cfg.log_timer = _resolve_period(cfg.logging_period, denominator)
cfg.val_timer = _resolve_period(cfg.validation_period, denominator)


def _resolve_period(period: Union[str, int],
denominator: int) -> Callable[[int, float], bool]:
"""Convert logging period into a function for logging time checks.
Logging and validation periods can both be provided either as a number of
batches after which to log/validate, or as a time interval between the
logs/validation runs.
This function unifies both representations into a function that decides
whether to log/validate based on a given training step and time since the
last log/validation.
Arguments:
period: Either a string representing time, or a number representing
number of batches.
denominator: Only allow logging when the given step (number of batches
since the start of the training) is divisible by this value.
This is used e.g. when `DelayedUpdateTrainer` is used.
Returns:
A function of the current training step and time since the last logging
period that returns a boolean value.
"""
def get_batch_logger(period: int) -> Callable[[int, float], bool]:
def is_time(step: int, _: float) -> bool:
return step != 0 and step % period == 0
return is_time

def get_time_logger(period: float) -> Callable[[int, float], bool]:
def is_time(step: int, last_time: float) -> bool:
if step % denominator != 0:
return False
return last_time + period < time.process_time()
return is_time

if isinstance(period, int):
if period % denominator != 0:
raise ValueError(
"When using delayed update trainer, the logging/validation "
"periods must be divisible by batches_per_update.")

return get_batch_logger(period)

regex = re.compile(
r"((?P<days>\d+?)d)?((?P<hours>\d+?)h)?((?P<minutes>\d+?)m)?"
r"((?P<seconds>\d+?)s)?")
parts = regex.match(period)

if not parts:
raise ValueError(
"Validation or logging period have incorrect format. "
"It should be in format: 3h; 5m; 14s")

time_params = {}
for (name, param) in parts.groupdict().items():
if param:
time_params[name] = int(param)

delta_seconds = timedelta(**time_params).total_seconds()
if delta_seconds <= 0:
raise ValueError("Validation or logging period must be bigger than 0")

return get_time_logger(delta_seconds)
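
A brief usage sketch of _resolve_period as defined above (the import path follows the new module location shown in this diff; the printed values for the batch-based timer follow directly from the code):

from neuralmonkey.config.normalize import _resolve_period

# Batch-based period: the period (500) must be divisible by the delayed
# trainer's batches_per_update (here 5); logging then fires on every step
# that is a non-zero multiple of 500.
batch_timer = _resolve_period(500, 5)
print(batch_timer(1000, 0.0))  # True  -> 1000 % 500 == 0
print(batch_timer(750, 0.0))   # False -> 750 is not a multiple of 500

# Time-based period: "3h" is parsed by the regex into a 3-hour timedelta;
# the returned callback compares last_time + 10800 s to time.process_time().
time_timer = _resolve_period("3h", 1)
time_timer(42, 0.0)  # False until three hours of process time have elapsed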
68 changes: 39 additions & 29 deletions neuralmonkey/decoders/autoregressive.py
@@ -15,7 +15,7 @@
from neuralmonkey.model.feedable import FeedDict
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.logging import log, warn
from neuralmonkey.logging import warn
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.nn.utils import dropout
from neuralmonkey.tf_utils import get_variable, get_state_shape_invariants
@@ -134,52 +134,62 @@ def __init__(self,
ModelPart.__init__(self, name, reuse, save_checkpoint, load_checkpoint,
initializers)

log("Initializing decoder, name: '{}'".format(name))

self.vocabulary = vocabulary
self.data_id = data_id
self.max_output_len = max_output_len
self.dropout_keep_prob = dropout_keep_prob
self.embedding_size = embedding_size
self._embedding_size = embedding_size
self.embeddings_source = embeddings_source
self.label_smoothing = label_smoothing
self.tie_embeddings = tie_embeddings
self.supress_unk = supress_unk

self.encoder_states = [] # type: List[tf.Tensor]
self.encoder_masks = [] # type: List[tf.Tensor]
self.encoder_states = lambda: [] # type: Callable[[], List[tf.Tensor]]
self.encoder_masks = lambda: [] # type: Callable[[], List[tf.Tensor]]

# Check the values of the parameters (max_output_len, ...)
if max_output_len <= 0:
raise ValueError("Maximum sequence length must be "
"a positive integer.")
if self.max_output_len <= 0:
raise ValueError(
"Maximum sequence length must be a positive integer.")

if dropout_keep_prob < 0.0 or dropout_keep_prob > 1.0:
raise ValueError("Dropout keep probability must be"
"a real number in the interval [0,1].")
if self._embedding_size is not None and self._embedding_size <= 0:
raise ValueError("Embedding size must be a positive integer.")

if self.embedding_size is None and self.embeddings_source is None:
raise ValueError("You must specify either embedding size or the "
"embedded sequence from which to reuse the "
"embeddings (e.g. set either 'embedding_size' or "
" 'embeddings_source' parameter)")
if self.dropout_keep_prob < 0.0 or self.dropout_keep_prob > 1.0:
raise ValueError("Dropout keep probability must be a real number "
"in the interval [0,1].")
# pylint: enable=too-many-arguments,too-many-locals

@property
def embedding_size(self) -> int:
if self.embeddings_source is None:
if self._embedding_size is None:
raise ValueError(
"You must specify either embedding size or the embedded "
"sequence from which to reuse the embeddings (e.g. set "
"'embedding_size' or 'embeddings_source' parameter)")
return self._embedding_size

if self.embeddings_source is not None:
if self.embedding_size is not None:
warn("Overriding the embedding_size parameter with the"
" size of the reused embeddings from the encoder.")
if self._embedding_size is not None:
warn("Overriding the embedding_size parameter with the "
"size of the reused embeddings from the encoder.")

self.embedding_size = (
self.embeddings_source.embedding_matrix.get_shape()[1].value)
return self.embeddings_source.embedding_matrix.get_shape()[1].value

with self.use_scope():
self.go_symbols = tf.placeholder(tf.int32, [None], "go_symbols")
# pylint: disable=no-self-use
@tensor
def go_symbols(self) -> tf.Tensor:
return tf.placeholder(tf.int32, [None], "go_symbols")

self.train_inputs = tf.placeholder(
tf.int32, [None, None], "train_inputs")
self.train_mask = tf.placeholder(
tf.float32, [None, None], "train_mask")
# pylint: enable=too-many-arguments,too-many-locals
@tensor
def train_inputs(self) -> tf.Tensor:
return tf.placeholder(tf.int32, [None, None], "train_inputs")

@tensor
def train_mask(self) -> tf.Tensor:
return tf.placeholder(tf.float32, [None, None], "train_mask")
# pylint: enable=no-self-use

@tensor
def decoding_w(self) -> tf.Variable:
6 changes: 4 additions & 2 deletions neuralmonkey/decoders/beam_search_decoder.py
@@ -163,13 +163,15 @@ def __init__(self,
# the beam. We need to access all the inner states of the network in
# the graph, replace them with beam-size-times copied originals, create
# the beam search graph, and then replace the inner states back.
self._building = False

enc_states = self.parent_decoder.encoder_states
enc_masks = self.parent_decoder.encoder_masks

setattr(self.parent_decoder, "encoder_states",
[self.expand_to_beam(states) for states in enc_states])
lambda: [self.expand_to_beam(sts) for sts in enc_states()])
setattr(self.parent_decoder, "encoder_masks",
[self.expand_to_beam(mask) for mask in enc_masks])
lambda: [self.expand_to_beam(mask) for mask in enc_masks()])

# Create the beam search symbolic graph.
with self.use_scope():
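With encoder_states and encoder_masks now being callables, the beam search decoder wraps them in lambdas instead of expanding pre-built tensor lists, so the beam expansion only happens when the parent decoder's graph-building code actually calls them. A schematic sketch of the wrapping pattern (plain-Python stand-ins, not the actual decoder classes):

class ToyParentDecoder:
    def __init__(self):
        # A callable returning encoder tensors instead of a fixed list.
        self.encoder_states = lambda: ["enc_state"]


class ToyBeamSearchDecoder:
    def __init__(self, parent, beam_size):
        self.beam_size = beam_size
        original = parent.encoder_states
        # Wrap the original callable; expansion is deferred until the
        # parent decoder's graph construction actually calls it.
        parent.encoder_states = (
            lambda: [self.expand_to_beam(s) for s in original()])

    def expand_to_beam(self, state):
        # Stand-in for tiling a tensor beam_size times along the batch axis.
        return "{} x{}".format(state, self.beam_size)


parent = ToyParentDecoder()
ToyBeamSearchDecoder(parent, beam_size=4)
print(parent.encoder_states())  # ['enc_state x4'] -- expanded lazily on call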
2 changes: 0 additions & 2 deletions neuralmonkey/decoders/ctc_decoder.py
@@ -6,7 +6,6 @@

from neuralmonkey.dataset import Dataset
from neuralmonkey.decorators import tensor
from neuralmonkey.logging import log
from neuralmonkey.model.feedable import FeedDict
from neuralmonkey.model.parameterized import InitializerSpecs
from neuralmonkey.model.model_part import ModelPart
@@ -47,7 +46,6 @@ def __init__(self,
self.merge_repeated_targets = merge_repeated_targets
self.merge_repeated_outputs = merge_repeated_outputs
self.beam_width = beam_width
log("CTC output tensor {}.".format(self.decoded))
# pylint: enable=too-many-arguments

# pylint: disable=no-self-use
(Diffs for the remaining changed files are not shown here.)
