implementation of EWC, gradient_runner and gradient averaging script
varisd committed Jul 25, 2018
1 parent 21af0fe commit df97173
Showing 6 changed files with 246 additions and 21 deletions.
2 changes: 1 addition & 1 deletion neuralmonkey/learning_utils.py
@@ -423,7 +423,7 @@ def _check_savable_dict(data):
        return False

    supported_type = Union[
-        List[Dict[str, np.ndarray]],
+        List[Dict[str, Union[np.ndarray, np.float32]]],
        List[List[Dict[str, np.ndarray]]]]

    try:
72 changes: 72 additions & 0 deletions neuralmonkey/runners/gradient_runner.py
@@ -0,0 +1,72 @@
from typing import Dict, List, Set, Union

from typeguard import check_argument_types

from neuralmonkey.runners.base_runner import (
    BaseRunner, Executable, FeedDict, ExecutionResult, NextExecute)
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.decoders.autoregressive import AutoregressiveDecoder
from neuralmonkey.decoders.classifier import Classifier
from neuralmonkey.trainers.generic_trainer import GenericTrainer

# pylint: disable=invalid-name
SupportedDecoder = Union[AutoregressiveDecoder, Classifier]
# pylint: enable=invalid-name


class GradientRunExecutable(Executable):
    """Executable that fetches the trainer's gradients for a batch."""

    def __init__(self,
                 all_coders: Set[ModelPart],
                 fetches: FeedDict) -> None:
        self._all_coders = all_coders
        self._fetches = fetches

        self.result = None

    def next_to_execute(self) -> NextExecute:
        """Get the feedables and tensors to run."""
        return self._all_coders, self._fetches, []

    def collect_results(self, results: List[Dict]) -> None:
        assert len(results) == 1

        for sess_result in results:
            gradient_dict = {}
            # key each gradient by its variable's name, stripping the
            # ":0" output suffix so the keys match checkpoint names
            tensor_names = [v.name.split(":")[0]
                            for v in self._fetches["variables"]]
            for name, val in zip(tensor_names, sess_result["gradients"]):
                gradient_dict[name] = val

        self.result = ExecutionResult(
            outputs=[gradient_dict],
            losses=[],
            scalar_summaries=None,
            histogram_summaries=None,
            image_summaries=None)


class GradientRunner(BaseRunner[SupportedDecoder]):
    """Runner that outputs gradients of the trainer's loss.

    Applies the provided trainer to a dataset and collects the gradients
    of the loss with respect to each trainable variable, e.g. to estimate
    the Fisher information for Elastic Weight Consolidation.
    """

    def __init__(self,
                 output_series: str,
                 trainer: GenericTrainer,
                 decoder: SupportedDecoder) -> None:
        check_argument_types()
        BaseRunner[SupportedDecoder].__init__(
            self, output_series, decoder)

        self._gradients = trainer.gradients

    # pylint: disable=unused-argument
    def get_executable(self,
                       compute_losses: bool,
                       summaries: bool,
                       num_sessions: int) -> GradientRunExecutable:
        # fetch the gradient tensors together with their variables,
        # skipping variables whose gradient is None
        fetches = {"gradients": [g for g, _ in self._gradients
                                 if g is not None],
                   "variables": [v for g, v in self._gradients
                                 if g is not None]}

        return GradientRunExecutable(self.all_coders, fetches)
    # pylint: enable=unused-argument

    @property
    def loss_names(self) -> List[str]:
        return []
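
For orientation, here is a self-contained TF 1.x sketch of the idea behind the runner (not part of the commit; the placeholder, variable, and loss are invented for illustration): compute the loss gradients for one batch and key them by variable name, which is the format the EWC tooling below consumes.

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 3])
with tf.variable_scope("proj"):
    w = tf.get_variable("w", shape=[3, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

# (gradient, variable) pairs, the same structure as GenericTrainer.gradients
grads_and_vars = tf.train.GradientDescentOptimizer(0.1).compute_gradients(loss)
pairs = [(g, v) for g, v in grads_and_vars if g is not None]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    grad_values = sess.run([g for g, _ in pairs],
                           feed_dict={x: np.ones((4, 3), np.float32)})
    # key gradients by variable name without the ":0" output suffix
    gradient_dict = {v.name.split(":")[0]: val
                     for (_, v), val in zip(pairs, grad_values)}
    print(sorted(gradient_dict))  # ['proj/w']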
7 changes: 3 additions & 4 deletions neuralmonkey/trainers/cross_entropy_trainer.py
@@ -5,6 +5,7 @@

from neuralmonkey.trainers.generic_trainer import (
    GenericTrainer, Objective, ObjectiveWeight)
+from neuralmonkey.trainers.regularizers import BaseRegularizer


def xent_objective(decoder, weight=None) -> Objective:
@@ -25,10 +26,9 @@ class CrossEntropyTrainer(GenericTrainer):
    def __init__(self,
                 decoders: List[Any],
                 decoder_weights: List[ObjectiveWeight] = None,
-                l1_weight: float = 0.,
-                l2_weight: float = 0.,
                 clip_norm: float = None,
                 optimizer: tf.train.Optimizer = None,
+                regularizers: List[BaseRegularizer] = None,
                 var_scopes: List[str] = None,
                 var_collection: str = None) -> None:
        check_argument_types()
@@ -47,9 +47,8 @@ def __init__(self,
        GenericTrainer.__init__(
            self,
            objectives=objectives,
-            l1_weight=l1_weight,
-            l2_weight=l2_weight,
            clip_norm=clip_norm,
            optimizer=optimizer,
+            regularizers=regularizers,
            var_scopes=var_scopes,
            var_collection=var_collection)
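
With this change, L1/L2 regularization is configured by passing regularizer objects instead of the former bare weights. A hypothetical construction in Python (in a real experiment these objects come from the INI configuration; the weights and the commented-out trainer call are illustrative only):

from neuralmonkey.trainers.regularizers import L1Regularizer, L2Regularizer

# weights chosen purely for illustration
regularizers = [L1Regularizer(weight=1.0e-8),
                L2Regularizer(weight=1.0e-8)]
# trainer = CrossEntropyTrainer(decoders=[decoder],
#                               regularizers=regularizers)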
39 changes: 23 additions & 16 deletions neuralmonkey/trainers/generic_trainer.py
@@ -6,6 +6,7 @@
from neuralmonkey.model.model_part import ModelPart
from neuralmonkey.runners.base_runner import (
    Executable, ExecutionResult, NextExecute)
+from neuralmonkey.trainers.regularizers import BaseRegularizer

# pylint: disable=invalid-name
Gradients = List[Tuple[tf.Tensor, tf.Variable]]
@@ -41,13 +42,17 @@ class GenericTrainer:

    def __init__(self,
                 objectives: List[Objective],
-                l1_weight: float = 0.0,
-                l2_weight: float = 0.0,
                 clip_norm: float = None,
                 optimizer: tf.train.Optimizer = None,
+                regularizers: List[BaseRegularizer] = None,
                 var_scopes: List[str] = None,
                 var_collection: str = None) -> None:
+
+        if regularizers is not None:
+            self.regularizers = regularizers
+        else:
+            self.regularizers = []

        if var_collection is None:
            var_collection = tf.GraphKeys.TRAINABLE_VARIABLES
@@ -84,18 +89,15 @@ def __init__(self,
                         and not v.name.startswith("vgg")
                         and not v.name.startswith("Inception")
                         and not v.name.startswith("resnet")]
-        l1_value = sum(tf.reduce_sum(abs(v)) for v in regularizable)
-        l1_cost = l1_weight * l1_value if l1_weight > 0 else 0.0
-
-        l2_value = sum(tf.reduce_sum(v ** 2) for v in regularizable)
-        l2_cost = l2_weight * l2_value if l2_weight > 0 else 0.0
+        reg_values = [reg.value(regularizable)
+                      for reg in self.regularizers]

        # unweighted losses for fetching
-        self.losses = [o.loss for o in objectives] + [l1_value, l2_value]
-        tf.summary.scalar("train_l1", l1_value,
-                          collections=["summary_train"])
-        tf.summary.scalar("train_l2", l2_value,
-                          collections=["summary_train"])
+        self.losses = [o.loss for o in objectives] + reg_values
+
+        for reg, reg_value in zip(self.regularizers, reg_values):
+            tf.summary.scalar(reg.name, reg_value,
+                              collections=["summary_train"])

        # log all objectives
        for obj in objectives:
Expand All @@ -108,9 +110,13 @@ def __init__(self,
with tf.control_dependencies(update_ops):
with tf.name_scope("gradient_collection"):
differentiable_loss_sum = sum(
(o.weight if o.weight is not None else 1) * o.loss
(o.weight if o.weight is not None else 1.) * o.loss
for o in objectives
if o.gradients is None) + l1_cost + l2_cost
if o.gradients is None)
differentiable_loss_sum += sum(
reg.weight * reg_value
for reg, reg_value in zip(self.regularizers,
reg_values))
implicit_gradients = self._get_gradients(
differentiable_loss_sum)

@@ -138,10 +144,11 @@ def __init__(self,
        self.all_coders = set.union(*(obj.decoder.get_dependencies()
                                      for obj in objectives))

+        self.gradients = gradients
        self.train_op = self.optimizer.apply_gradients(
-            gradients, global_step=step)
+            self.gradients, global_step=step)

-        for grad, var in gradients:
+        for grad, var in self.gradients:
            if grad is not None:
                tf.summary.histogram(
                    "gr_" + var.name,
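
The trainer now minimizes the weighted objective losses plus the weighted regularizer values. A toy numeric check of how the loss is assembled (plain Python with made-up values, mirroring the two sums above):

objective_losses = [2.0, 1.0]   # o.loss for each objective
objective_weights = [1.0, 0.5]  # o.weight (None counts as 1.)
reg_values = [4.0]              # reg.value(regularizable)
reg_weights = [0.01]            # reg.weight

loss_sum = sum(w * l for w, l in zip(objective_weights, objective_losses))
loss_sum += sum(w * v for w, v in zip(reg_weights, reg_values))
assert abs(loss_sum - 2.54) < 1e-9  # 2.0 + 0.5 + 0.04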
91 changes: 91 additions & 0 deletions neuralmonkey/trainers/regularizers.py
@@ -0,0 +1,91 @@
"""Variable regularizers.
This module contains classes that can be used as a variable regularizers
during training. All implementation should be derived from the BaseRegularizer
class.
"""
from typing import List

import numpy as np
import tensorflow as tf
from typeguard import check_argument_types

from neuralmonkey.logging import log


# pylint: disable=too-few-public-methods
class BaseRegularizer:
"""Base class for the regularizers."""

def __init__(self,
name: str,
weight: float) -> None:
check_argument_types()
self.name = name
self.weight = weight

def value(self, variables) -> float:
raise NotImplementedError("Abstract method")


class L1Regularizer(BaseRegularizer):

def __init__(self,
name: str = "train_l1",
weight: float = 0.) -> None:
BaseRegularizer.__init__(self, name, weight)

def value(self, variables: List[tf.Tensor]) -> float:
return sum(tf.reduce_sum(abs(v)) for v in variables)


class L2Regularizer(BaseRegularizer):

def __init__(self,
name: str = "train_l2",
weight: float = 0.) -> None:
BaseRegularizer.__init__(self, name, weight)

def value(self, variables: List[tf.Tensor]) -> float:
return sum(tf.reduce_sum(v ** 2) for v in variables)


class EWCRegularizer(BaseRegularizer):
"""Regularizer based on the Elastic Weight Consolidation.
TODO description
"""

def __init__(self,
name: str = "train_ewc",
weight: float = 0.,
gradients_file: str = None,
variables_file: str = None) -> None:
check_argument_types()

BaseRegularizer.__init__(self, name, weight)

if gradients_file is None:
raise ValueError("Missing gradients_file")
if variables_file is None:
raise ValueError("Missing variables_file")

log("Loading initial variables for EWC from {}".format(variables_file))
self.init_vars = tf.contrib.framework.load_checkpoint(variables_file)
log("EWC initial variables loaded")

log("Loading gradient estimates from {}".format(gradients_file))
self.gradients = np.load(gradients_file)
log("Gradient estimates loaded")

def value(self, variables: List[tf.Tensor]) -> float:
ewc_value = 0.0
for var in variables:
var_name = var.name.split(":")[0]
init_var = self.init_vars.get_tensor(var_name)
gradient = self.gradients[var_name]
ewc_value += tf.reduce_sum(tf.multiply(
tf.square(gradient), tf.square(var - init_var)))

return ewc_value
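
A minimal numpy sketch of the penalty that EWCRegularizer.value computes, using toy arrays for a single variable (all values invented):

import numpy as np

grad_estimate = np.array([0.9, 0.1])  # averaged gradients from avg_tensors.py
theta = np.array([1.0, 2.0])          # current variable values
theta_init = np.array([0.5, 2.0])     # values stored in the checkpoint

penalty = np.sum(np.square(grad_estimate) * np.square(theta - theta_init))
# 0.81 * 0.25 + 0.01 * 0.0 = 0.2025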
56 changes: 56 additions & 0 deletions scripts/avg_tensors.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""Compute the mean over a set of tensors.

The tensors can be spread over multiple npz files. The mean is computed
over the first dimension (supposed to be a batch).
"""

import argparse
import glob

import numpy as np

from neuralmonkey.logging import log as _log


def log(message: str, color: str = "blue") -> None:
    _log(message, color)


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--file_prefix", type=str, required=True,
                        help="prefix of the npz files to be averaged")
    parser.add_argument("--output_path", type=str, required=True,
                        help="path to the output npz file with the "
                             "averaged tensors")
    args = parser.parse_args()

    output_dict = {}
    n = 0
    for file in glob.glob("{}.*npz".format(args.file_prefix)):
        log("Processing {}".format(file))
        tensors = np.load(file)

        # first dimension must be equal for all tensors (batch)
        shapes = [tensors[f].shape for f in tensors.files]
        assert all(x[0] == shapes[0][0] for x in shapes)

        for varname in tensors.files:
            res = np.sum(tensors[varname], 0)
            if varname in output_dict:
                output_dict[varname] += res
            else:
                output_dict[varname] = res
        n += shapes[0][0]

    for name in output_dict:
        output_dict[name] /= n

    np.savez(args.output_path, **output_dict)


if __name__ == "__main__":
    main()
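
Assuming per-batch gradient dumps named gradients.0.npz, gradients.1.npz, and so on (hypothetical file names that match the script's glob pattern), a typical invocation would be:

./scripts/avg_tensors.py --file_prefix gradients --output_path gradients_avg.npz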
