From 083d62f1ec0dae2cb8a2d27c77549e2cd2c8af43 Mon Sep 17 00:00:00 2001 From: Pengzhi Gao Date: Wed, 19 Feb 2020 14:24:24 -0500 Subject: [PATCH 1/3] Add ELMo modules --- requirements.txt | 1 + setup.py | 1 + .../data/tokenizers/elmo_tokenizer_utils.py | 136 ++ .../tokenizers/elmo_tokenizer_utils_test.py | 105 + texar/torch/modules/encoders/__init__.py | 1 + texar/torch/modules/encoders/elmo_encoder.py | 323 +++ .../modules/encoders/elmo_encoder_test.py | 146 ++ texar/torch/modules/pretrained/__init__.py | 1 + texar/torch/modules/pretrained/elmo.py | 104 + texar/torch/modules/pretrained/elmo_test.py | 71 + texar/torch/modules/pretrained/elmo_utils.py | 2166 +++++++++++++++++ .../modules/pretrained/elmo_utils_test.py | 882 +++++++ texar/torch/utils/test.py | 3 + 13 files changed, 3940 insertions(+) create mode 100644 texar/torch/data/tokenizers/elmo_tokenizer_utils.py create mode 100644 texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py create mode 100644 texar/torch/modules/encoders/elmo_encoder.py create mode 100644 texar/torch/modules/encoders/elmo_encoder_test.py create mode 100644 texar/torch/modules/pretrained/elmo.py create mode 100644 texar/torch/modules/pretrained/elmo_test.py create mode 100644 texar/torch/modules/pretrained/elmo_utils.py create mode 100644 texar/torch/modules/pretrained/elmo_utils_test.py diff --git a/requirements.txt b/requirements.txt index efdba2f84..22f7f8cdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ numpy >= 1.15.4 mypy_extensions >= 0.4.1 regex >= 2018.01.10 sentencepiece >= 0.1.8 +h5py >= 2.10.0 diff --git a/setup.py b/setup.py index 85fef9cb4..3b86d933c 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ install_requires=[ 'regex>=2018.01.10', 'numpy', + 'h5py>=2.10.0', 'requests', 'funcsigs', 'sentencepiece>=0.1.8', diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py new file mode 100644 index 000000000..ea454d0d8 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py @@ -0,0 +1,136 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of pre-trained ELMo tokenizer. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/elmo_indexer.py` +""" +from typing import Dict, List, Optional + +import torch + +from torch.nn.utils.rnn import pad_sequence + + +__all__ = [ + "ELMoCharacterMapper", + "batch_to_ids", +] + + +def _make_bos_eos( + character: int, + padding_character: int, + beginning_of_word_character: int, + end_of_word_character: int, + max_word_length: int, +): + char_ids = [padding_character] * max_word_length + char_ids[0] = beginning_of_word_character + char_ids[1] = character + char_ids[2] = end_of_word_character + return char_ids + + +class ELMoCharacterMapper: + r"""Maps individual tokens to sequences of character ids, compatible with + ELMo. 
To be consistent with previously trained models, we include it here as + special of existing character indexers. + + We allow to add optional additional special tokens with designated + character ids with `tokens_to_add`. + """ + + max_word_length = 50 + + # char ids 0-255 come from utf-8 encoding bytes + # assign 256-300 to special chars + beginning_of_sentence_character = 256 # + end_of_sentence_character = 257 # + beginning_of_word_character = 258 # + end_of_word_character = 259 # + padding_character = 260 # + + beginning_of_sentence_characters = _make_bos_eos( + beginning_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + end_of_sentence_characters = _make_bos_eos( + end_of_sentence_character, + padding_character, + beginning_of_word_character, + end_of_word_character, + max_word_length, + ) + + bos_token = "" + eos_token = "" + + def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None: + self.tokens_to_add = tokens_to_add or {} + + def convert_word_to_char_ids(self, word: str) -> List[int]: + if word in self.tokens_to_add: + char_ids = ([ELMoCharacterMapper.padding_character] * + ELMoCharacterMapper.max_word_length) + char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + char_ids[1] = self.tokens_to_add[word] + char_ids[2] = ELMoCharacterMapper.end_of_word_character + elif word == ELMoCharacterMapper.bos_token: + char_ids = ELMoCharacterMapper.beginning_of_sentence_characters + elif word == ELMoCharacterMapper.eos_token: + char_ids = ELMoCharacterMapper.end_of_sentence_characters + else: + word_encoded = word.encode( + "utf-8", "ignore")[: (ELMoCharacterMapper.max_word_length - 2)] + char_ids = ([ELMoCharacterMapper.padding_character] * + ELMoCharacterMapper.max_word_length) + char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + for k, chr_id in enumerate(word_encoded, start=1): + char_ids[k] = chr_id + char_ids[len(word_encoded) + 1] = \ + ELMoCharacterMapper.end_of_word_character + + # +1 one for masking + return [c + 1 for c in char_ids] + + def __eq__(self, other) -> bool: + if isinstance(self, other.__class__): + return self.__dict__ == other.__dict__ + return NotImplemented + + +def batch_to_ids(batch: List[List[str]]) -> torch.Tensor: + r"""Converts a batch of tokenized sentences to a tensor representing the + sentences with encoded characters (len(batch), max sentence length, + max word length). + + Args: + batch: A list of tokenized sentences. + + Returns: + A tensor of padded character ids. + """ + res = [] + mapper = ELMoCharacterMapper() + for sentence in batch: + character_ids = [mapper.convert_word_to_char_ids(token) + for token in sentence] + res.append(torch.tensor(character_ids)) + + return pad_sequence(res, batch_first=True) diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py new file mode 100644 index 000000000..f8dac6703 --- /dev/null +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py @@ -0,0 +1,105 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for pre-trained ELMo tokenizer. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + ELMoCharacterMapper, batch_to_ids) + + +class ELMoTokenizerUtilsTest(unittest.TestCase): + + def test_bos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + expected_indices = [ + 259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_eos_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids('') + expected_indices = [ + 259, 258, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_unicode_to_char_ids(self): + mapper = ELMoCharacterMapper() + indices = mapper.convert_word_to_char_ids(chr(256) + "t") + expected_indices = [ + 259, 197, 129, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_additional_tokens(self): + mapper = ELMoCharacterMapper(tokens_to_add={"": 1}) + indices = mapper.convert_word_to_char_ids("") + expected_indices = [ + 259, 2, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + ] + self.assertEqual(indices, expected_indices) + + def test_batch_to_ids(self): + sentences = [['First', 'sentence', '.'], ['Another', '.']] + indices = batch_to_ids(sentences) + expected_indices = [[[ + 259, 71, 106, 115, 116, 117, 260, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ + 259, 116, 102, 111, 117, 102, 111, 100, 102, 260, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ + 259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261]], + [[259, 66, 
111, 112, 117, 105, 102, 115, 260, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], + [259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, + 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]]] + self.assertEqual(indices.tolist(), expected_indices) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/encoders/__init__.py b/texar/torch/modules/encoders/__init__.py index ce69fd985..1031dab5c 100644 --- a/texar/torch/modules/encoders/__init__.py +++ b/texar/torch/modules/encoders/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.encoders.bert_encoder import * from texar.torch.modules.encoders.conv_encoders import * +from texar.torch.modules.encoders.elmo_encoder import * from texar.torch.modules.encoders.encoder_base import * from texar.torch.modules.encoders.gpt2_encoder import * from texar.torch.modules.encoders.multihead_attention import * diff --git a/texar/torch/modules/encoders/elmo_encoder.py b/texar/torch/modules/encoders/elmo_encoder.py new file mode 100644 index 000000000..2011d3dbc --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder.py @@ -0,0 +1,323 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ELMo encoder. +""" +import json +import os +import tempfile +import warnings + +from typing import Any, Dict, List, Optional, Union + +import torch + +from torch.nn.modules import Dropout + +from texar.torch.modules.encoders.encoder_base import EncoderBase +from texar.torch.modules.pretrained.elmo import PretrainedELMoMixin +from texar.torch.modules.pretrained.elmo_utils import ( + _ElmoBiLm, ScalarMix, remove_sentence_boundaries) + +__all__ = [ + "ELMoEncoder", +] + + +class ELMoEncoder(EncoderBase, PretrainedELMoMixin): + r"""ELMo model for encoding sequences. Please see + :class:`~texar.torch.modules.PretrainedELMoMixin` for a brief description + of ELMo. + + Args: + pretrained_model_name (optional): a `str`, the name + of pre-trained model (e.g., ``elmo-small``). Please refer to + :class:`~texar.torch.modules.PretrainedELMoMixin` for + all supported models. + If `None`, the model name in :attr:`hparams` is used. + cache_dir (optional): the path to a folder in which the + pre-trained models will be cached. If `None` (default), + a default directory (``texar_data`` folder under user's home + directory) will be used. + hparams (dict or HParams, optional): Hyperparameters. Missing + hyperparameter will be set to default values. See + :meth:`default_hparams` for the hyperparameter structure + and default values. 
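+
+    Example usage (a minimal sketch; it assumes the ``elmo-small``
+    checkpoint can be downloaded or is already cached locally):
+
+    .. code-block:: python
+
+        from texar.torch.data.tokenizers.elmo_tokenizer_utils import (
+            batch_to_ids)
+
+        encoder = ELMoEncoder(pretrained_model_name="elmo-small")
+
+        # Character ids of shape `[batch_size, max_time, 50]`.
+        character_ids = batch_to_ids([["A", "sentence", "."]])
+
+        outputs = encoder(character_ids)
+        # A list of 2 representations, each of shape `[1, 3, 256]`,
+        # where 256 = 2 * projection_dim for ``elmo-small``.
+        representations = outputs["elmo_representations"]
+        mask = outputs["mask"]  # shape `[1, 3]`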
+ """ + def __init__(self, + pretrained_model_name: Optional[str] = None, + cache_dir: Optional[str] = None, + hparams=None): + super().__init__(hparams=hparams) + + self.load_pretrained_config(pretrained_model_name, cache_dir) + + options_file = None + weight_file = None + tmp_dir = tempfile.TemporaryDirectory() + if self.pretrained_model_dir is not None: + info = list(os.walk(self.pretrained_model_dir)) + root, _, files = info[0] + for file in files: + if file.endswith('options.json'): + options_file = os.path.join(root, file) + if file.endswith('weights.hdf5'): + weight_file = os.path.join(root, file) + else: + with open(os.path.join(tmp_dir.name, 'options.json'), "w") as fp: + json.dump(self.hparams.encoder.todict(), fp) + options_file = os.path.join(tmp_dir.name, 'options.json') + + assert options_file is not None + self._elmo_lstm = _ElmoBiLm( + options_file, + weight_file, # type: ignore + requires_grad=self.hparams.requires_grad, + vocab_to_cache=self.hparams.vocab_to_cache, + ) + tmp_dir.cleanup() + + self._has_cached_vocab = self.hparams.vocab_to_cache is not None + self._keep_sentence_boundaries = self.hparams.keep_sentence_boundaries + self._dropout = Dropout(p=self.hparams.dropout) + self._scalar_mixes: Any = [] + for k in range(self.hparams.num_output_representations): + scalar_mix = ScalarMix( + self._elmo_lstm.num_layers, + do_layer_norm=self.hparams.do_layer_norm, + initial_scalar_parameters=self.hparams.scalar_mix_parameters, + trainable=self.hparams.scalar_mix_parameters is None, + ) + self.add_module("scalar_mix_{}".format(k), scalar_mix) + self._scalar_mixes.append(scalar_mix) + + @staticmethod + def default_hparams(): + r"""Returns a dictionary of hyperparameters with default values. + + * The encoder arch is determined by the constructor argument + :attr:`pretrained_model_name` if it's specified. In this case, + `hparams` are ignored. + * Otherwise, the encoder arch is determined by + `hparams['pretrained_model_name']` if it's specified. All other + configurations in `hparams` are ignored. + * If the above two are `None`, the encoder arch is defined by the + configurations in `hparams` and weights are randomly initialized. + + .. code-block:: python + + { + "pretrained_model_name": "elmo-small", + "encoder": { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], + [5, 256], [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + "num_output_representations": 2, + "requires_grad": False, + "do_layer_norm": False, + "dropout": 0.5, + "vocab_to_cache": None, + "keep_sentence_boundaries": False, + "scalar_mix_parameters": None, + "name": "elmo_encoder", + } + + Here: + + The default parameters are values for elmo-small model. + + `"pretrained_model_name"`: str or None + The name of the pre-trained ELMo model. If None, the model + will be randomly initialized. + + `"encoder"`: dict + Hyperparameters for ELMo encoder. + + `"num_output_representations"`: int + The number of ELMo representation to output with different linear + weighted combination of the 3 layers (i.e., character-convnet + output, 1st lstm output, 2nd lstm output). + + `"requires_grad"`: bool + If True, compute gradient of ELMo parameters for fine tuning. + + `"do_layer_norm"`: bool + Should we apply layer normalization (passed to `ScalarMix`)? 
+ + `"dropout"`: float + The dropout to be applied to the ELMo representations. + + `"vocab_to_cache"`: List[str] + A list of words to pre-compute and cache character convolutions + for. If you use this option, Elmo expects that you pass word + indices of shape (batch_size, timesteps) to forward, instead + of character indices. If you use this option and pass a word which + wasn't pre-cached, this will break. + + `"keep_sentence_boundaries"`: bool + If True, the representation of the sentence boundary tokens are + not removed. + + `"scalar_mix_parameters"`: List[float] + If not `None`, use these scalar mix parameters to weight the + representations produced by different layers. These mixing weights + are not updated during training. The mixing weights here should be + the unnormalized (i.e., pre-softmax) weights. So, if you wanted to + use only the 1st layer of a 2-layer ELMo, you can set this to + [-9e10, 1, -9e10 ]. + + `"name"`: str + Name of the module. + """ + return { + 'pretrained_model_name': 'elmo-small', + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + 'num_output_representations': 2, + 'requires_grad': False, + 'do_layer_norm': False, + 'dropout': 0.5, + 'vocab_to_cache': None, + 'keep_sentence_boundaries': False, + 'scalar_mix_parameters': None, + 'name': 'elmo_encoder', + '@no_typecheck': ['pretrained_model_name'] + } + + def forward(self, # type: ignore + inputs: torch.Tensor, + word_inputs: Optional[torch.Tensor] = None) -> \ + Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + Args: + inputs: Shape `[batch_size, max_time, 50]` of character ids + representing the current batch. + word_inputs: If you passed a cached vocab, you can in addition pass + a tensor of shape `[batch_size, max_time]`, which represent + word ids which have been pre-cached. + + Returns: + A Dict with keys: + + - :attr:`elmo_representations`: A `num_output_representations` list + of ELMo representations for the input sequence. Each + representation is shape `[batch_size, max_time, embedding_dim]` + + - :attr:`mask`: Shape `(batch_size, timesteps)` long tensor + with sequence mask. 
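+
+        For example, with the default ``elmo-small`` configuration and
+        :attr:`inputs` of shape `[2, 7, 50]`, each element of
+        :attr:`elmo_representations` has shape `[2, 7, 256]` and
+        :attr:`mask` has shape `[2, 7]`, where 256 = 2 * ``projection_dim``.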
+ """ + # reshape the input if needed + original_shape = inputs.size() + if len(original_shape) > 3: + timesteps, num_characters = original_shape[-2:] + reshaped_inputs = inputs.view(-1, timesteps, num_characters) + else: + reshaped_inputs = inputs + + if word_inputs is not None: + original_word_size = word_inputs.size() + if self._has_cached_vocab and len(original_word_size) > 2: + reshaped_word_inputs = word_inputs.view(-1, + original_word_size[-1]) + elif not self._has_cached_vocab: + warnings.warn( + "Word inputs were passed to ELMo but it does not have a " + "cached vocab.") + reshaped_word_inputs = None # type: ignore + else: + reshaped_word_inputs = word_inputs + else: + reshaped_word_inputs = word_inputs # type: ignore + + # run the biLM + bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs) + layer_activations = bilm_output["activations"] + mask_with_bos_eos = bilm_output["mask"] + + # compute the elmo representations + representations = [] + for i in range(len(self._scalar_mixes)): + scalar_mix = getattr(self, "scalar_mix_{}".format(i)) + representation_with_bos_eos = scalar_mix(layer_activations, + mask_with_bos_eos) + if self._keep_sentence_boundaries: + processed_representation = representation_with_bos_eos + processed_mask = mask_with_bos_eos + else: + representation_without_bos_eos, mask_without_bos_eos = \ + remove_sentence_boundaries( + representation_with_bos_eos, mask_with_bos_eos) + processed_representation = representation_without_bos_eos + processed_mask = mask_without_bos_eos + representations.append(self._dropout(processed_representation)) + + # reshape if necessary + if word_inputs is not None and len(original_word_size) > 2: + mask = processed_mask.view(original_word_size) + elmo_representations = [ + representation.view(original_word_size + (-1,)) + for representation in representations + ] + elif len(original_shape) > 3: + mask = processed_mask.view(original_shape[:-1]) + elmo_representations = [ + representation.view(original_shape[:-1] + (-1,)) + for representation in representations + ] + else: + mask = processed_mask + elmo_representations = representations + + return {"elmo_representations": elmo_representations, "mask": mask} + + @property + def output_size(self): + return self._elmo_lstm.get_output_dim() diff --git a/texar/torch/modules/encoders/elmo_encoder_test.py b/texar/torch/modules/encoders/elmo_encoder_test.py new file mode 100644 index 000000000..04a34b359 --- /dev/null +++ b/texar/torch/modules/encoders/elmo_encoder_test.py @@ -0,0 +1,146 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo Encoder. 
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` +""" + +import unittest + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.modules.encoders.elmo_encoder import ELMoEncoder +from texar.torch.utils.test import pretrained_test + + +class ELMoEncoderTest(unittest.TestCase): + r"""Tests :class:`~texar.torch.modules.ELMoEncoder` class. + """ + + @pretrained_test + def test_model_loading(self): + r"""Tests model loading functionality.""" + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + for pretrained_model_name in ELMoEncoder.available_checkpoints(): + encoder = ELMoEncoder(pretrained_model_name=pretrained_model_name) + _ = encoder(character_ids) + + def test_encode(self): + r"""Tests encoding. + """ + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + } + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + assert list(elmo_representations[0].size()) == [2, 7, 32] + assert list(elmo_representations[1].size()) == [2, 7, 32] + assert list(mask.size()) == [2, 7] + + def test_elmo_keep_sentence_boundaries(self): + hparams = { + "pretrained_model_name": None, + 'encoder': { + "lstm": { + "cell_clip": 3, + "use_skip_connections": True, + "n_layers": 2, + "proj_clip": 3, + "projection_dim": 16, + "dim": 64 + }, + "char_cnn": { + "embedding": { + "dim": 4 + }, + "filters": [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + "n_highway": 2, + "n_characters": 262, + "max_characters_per_token": 50, + "activation": "relu" + } + }, + 'dropout': 0.0, + 'keep_sentence_boundaries': True, + } + encoder = ELMoEncoder(hparams=hparams) + + sentences = [ + ["The", "sentence", "."], + ["ELMo", "helps", "disambiguate", "ELMo", "from", "Elmo", "."], + ] + character_ids = batch_to_ids(sentences) + output = encoder(character_ids) + elmo_representations = output["elmo_representations"] + mask = output["mask"] + + assert len(elmo_representations) == 2 + # Add 2 to the lengths because we're keeping the start and end of + # sentence tokens. 
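+        # The last dimension is 2 * projection_dim (here 2 * 16 = 32).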
+ assert list(elmo_representations[0].size()) == [2, 7 + 2, 32] + assert list(elmo_representations[1].size()) == [2, 7 + 2, 32] + assert list(mask.size()) == [2, 7 + 2] + + @pretrained_test + def test_trainable_variables(self): + encoder = ELMoEncoder() + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is False for grad in elmo_grads) + + encoder = ELMoEncoder(hparams={'requires_grad': True}) + elmo_grads = [ + param.requires_grad for param in encoder._elmo_lstm.parameters() + ] + assert all(grad is True for grad in elmo_grads) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/__init__.py b/texar/torch/modules/pretrained/__init__.py index 1f06a87a9..1e0ae19d3 100644 --- a/texar/torch/modules/pretrained/__init__.py +++ b/texar/torch/modules/pretrained/__init__.py @@ -17,6 +17,7 @@ from texar.torch.modules.pretrained.pretrained_base import * from texar.torch.modules.pretrained.bert import * +from texar.torch.modules.pretrained.elmo import * from texar.torch.modules.pretrained.gpt2 import * from texar.torch.modules.pretrained.roberta import * from texar.torch.modules.pretrained.xlnet import * diff --git a/texar/torch/modules/pretrained/elmo.py b/texar/torch/modules/pretrained/elmo.py new file mode 100644 index 000000000..ef616d1b5 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo.py @@ -0,0 +1,104 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. +""" + +import json +import os + +from abc import ABC +from typing import Any, Dict + +from texar.torch.modules.pretrained.pretrained_base import PretrainedMixin + +__all__ = [ + "PretrainedELMoMixin", +] + +_ELMo_PATH = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/" + + +class PretrainedELMoMixin(PretrainedMixin, ABC): + r"""A mixin class to support loading pre-trained checkpoints for modules + that implement the ELMo model. + + The ELMo model was proposed in + `Deep contextualized word representations`_ + by `Peters et al.` from Allen Institute for Artificial Intelligence. It is + a deep bidirectional language model (biLM), which is pre-trained on a + large text corpus. + + The available ELMo models are as follows: + + * ``elmo-small``: 13.6M parameters, trained on 800M tokens. + * ``elmo-medium``: 28.0M parameters, trained on 800M tokens. + * ``elmo-original``: 93.6M parameters, trained on 800M tokens. + * ``elmo-original-5.5b``: 93.6M parameters, trained on 5.5B tokens. + + We provide the following ELMo classes: + + * :class:`~texar.torch.modules.ELMoEncoder` for text encoding. + + .. 
_`Deep contextualized word representations`: + https://arxiv.org/abs/1802.05365 + """ + _MODEL_NAME = "ELMo" + _MODEL2URL = { + 'elmo-small': [ + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x1024_128_2048cnn_1xhighway/' + 'elmo_2x1024_128_2048cnn_1xhighway_options.json', + ], + 'elmo-medium': [ + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5', + _ELMo_PATH + '2x2048_256_2048cnn_1xhighway/' + 'elmo_2x2048_256_2048cnn_1xhighway_options.json', + ], + 'elmo-original': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway/' + 'elmo_2x4096_512_2048cnn_2xhighway_options.json', + ], + 'elmo-original-5.5b': [ + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5', + _ELMo_PATH + '2x4096_512_2048cnn_2xhighway_5.5B/' + 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json', + ], + } + + @classmethod + def _transform_config(cls, pretrained_model_name: str, + cache_dir: str) -> Dict[str, Any]: + info = list(os.walk(cache_dir)) + root, _, files = info[0] + config_path = None + for file in files: + if file.endswith('options.json'): + config_path = os.path.join(root, file) + if config_path is None: + raise ValueError(f"Cannot find the config file in {cache_dir}") + + with open(config_path) as f: + config_elmo = json.loads(f.read()) + + return {'encoder': config_elmo} + + def _init_from_checkpoint(self, pretrained_model_name: str, + cache_dir: str, **kwargs): + return diff --git a/texar/torch/modules/pretrained/elmo_test.py b/texar/torch/modules/pretrained/elmo_test.py new file mode 100644 index 000000000..d31bb1f5a --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_test.py @@ -0,0 +1,71 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ELMo utils. +""" + +import os +import unittest + +from texar.torch.modules.pretrained.elmo import * +from texar.torch.utils.test import pretrained_test + + +class ELMoUtilsTest(unittest.TestCase): + r"""Tests ELMo Utils. 
+ """ + + @pretrained_test + def test_load_pretrained_elmo_AND_transform_elmo_to_texar_config(self): + pretrained_model_dir = PretrainedELMoMixin.download_checkpoint( + pretrained_model_name="elmo-small") + + info = list(os.walk(pretrained_model_dir)) + _, _, files = info[0] + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5', files) + self.assertIn('elmo_2x1024_128_2048cnn_1xhighway_options.json', files) + + model_config = PretrainedELMoMixin._transform_config( + pretrained_model_name="elmo-small", + cache_dir=pretrained_model_dir) + + exp_config = { + 'encoder': { + "lstm": { + "use_skip_connections": True, + "projection_dim": 128, + "cell_clip": 3, + "proj_clip": 3, + "dim": 1024, + "n_layers": 2 + }, + "char_cnn": { + "activation": "relu", + "filters": [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], + [6, 512], [7, 1024]], + "n_highway": 1, + "embedding": { + "dim": 16 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } + }, + } + + self.assertDictEqual(model_config, exp_config) + + +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/modules/pretrained/elmo_utils.py b/texar/torch/modules/pretrained/elmo_utils.py new file mode 100644 index 000000000..65b8f2f69 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils.py @@ -0,0 +1,2166 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utils of ELMo Modules. 
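+
+This module implements the building blocks used by
+:class:`~texar.torch.modules.ELMoEncoder`: the character-CNN token encoder,
+the stacked bidirectional LSTM with projection cells, highway layers, the
+scalar mixture of layer activations, and assorted tensor utilities.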
+ +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/common/checks.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/common/util.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo_lstm.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/encoder_base.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/lstm_cell_with_projection.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/highway.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/scalar_mix.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/time_distributed.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/modules/token_embedders/embedding.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/initializers.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py` +""" +import itertools +import json +import logging + +from itertools import islice +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, + Tuple, TypeVar, Union) + +import h5py +import numpy +import torch + +from torch.nn import ParameterList, Parameter +from torch.nn.functional import embedding +from torch.nn.utils.rnn import ( + pad_packed_sequence, pack_padded_sequence, PackedSequence) + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( + batch_to_ids, ELMoCharacterMapper) + +# pylint: disable=attribute-defined-outside-init,protected-access + +__all__ = [ + "_ElmoBiLm", + "_ElmoCharacterEncoder", + "_EncoderBase", + "ConfigurationError", + "ElmoLstm", + "Embedding", + "Highway", + "LstmCellWithProjection", + "ScalarMix", + "TimeDistributed", + "add_sentence_boundary_token_ids", + "block_orthogonal", + "combine_initial_dims", + "get_device_of", + "get_dropout_mask", + "get_lengths_from_binary_sequence_mask", + "lazy_groups_of", + "remove_sentence_boundaries", + "sort_batch_by_length", + "uncombine_initial_dims", +] + + +class _ElmoBiLm(torch.nn.Module): + r"""Run a pre-trained bidirectional language model, outputting the + activations at each layer for weighting together into an ELMo + representation (with `allennlp.modules.seq2seq_encoders.Elmo`). + This is a lower level class, useful for advanced uses, but most users + should use `allennlp.modules.Elmo` directly. + + # Parameters + + options_file : `str` + ELMo JSON options file + weight_file : `str` + ELMo hdf5 weight file + requires_grad : `bool`, optional, (default = False). + If True, compute gradient of ELMo parameters for fine tuning. + vocab_to_cache : `List[str]`, optional, (default = None). + A list of words to pre-compute and cache character convolutions + for. If you use this option, _ElmoBiLm expects that you pass word + indices of shape (batch_size, timesteps) to forward, instead + of character indices. If you use this option and pass a word which + wasn't pre-cached, this will break. + """ + + def __init__( + self, + options_file: str, + weight_file: str, + requires_grad: bool = False, + vocab_to_cache: Optional[List[str]] = None, + ) -> None: + super().__init__() + + self._token_embedder = _ElmoCharacterEncoder( + options_file, weight_file, requires_grad=requires_grad + ) + + self._requires_grad = requires_grad + if requires_grad and vocab_to_cache: + logging.warning( + "You are fine tuning ELMo and caching char CNN word vectors. 
" + "This behaviour is not guaranteed to be well defined, " + "particularly. " + "if not all of your inputs will occur in the vocabulary cache." + ) + # This is an embedding, used to look up cached + # word vectors built from character level cnn embeddings. + self._word_embedding = None + self._bos_embedding: torch.Tensor = None # type: ignore + self._eos_embedding: torch.Tensor = None # type: ignore + if vocab_to_cache: + logging.info( + "Caching character cnn layers for words in vocabulary.") + # This sets 3 attributes, _word_embedding, _bos_embedding and + # _eos_embedding. They are set in the method so they can be accessed + # from outside the constructor. + self.create_cached_cnn_embeddings(vocab_to_cache) + + with open(options_file, "r") as fin: + options = json.load(fin) + if not options["lstm"].get("use_skip_connections"): + raise ConfigurationError( + "We only support pretrained biLMs with residual connections") + self._elmo_lstm = ElmoLstm( + input_size=options["lstm"]["projection_dim"], + hidden_size=options["lstm"]["projection_dim"], + cell_size=options["lstm"]["dim"], + num_layers=options["lstm"]["n_layers"], + memory_cell_clip_value=options["lstm"]["cell_clip"], + state_projection_clip_value=options["lstm"]["proj_clip"], + requires_grad=requires_grad, + ) + + if weight_file is not None: + self._elmo_lstm.load_weights(weight_file) + # Number of representation layers including context independent layer + self.num_layers = options["lstm"]["n_layers"] + 1 + + def get_output_dim(self): + return 2 * self._token_embedder.get_output_dim() + + def forward( # type: ignore + self, inputs: torch.Tensor, word_inputs: Optional[torch.Tensor] = None + ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: + r"""Encodes the inputs. + + # Parameters + + inputs : `torch.Tensor`, required. + Shape `(batch_size, timesteps, 50)` of character ids representing + the current batch. + word_inputs : `torch.Tensor`, required. + If you passed a cached vocab, you can in addition pass a tensor of + shape `(batch_size, timesteps)`, which represent word ids which + have been pre-cached. + + # Returns + + Dict with keys: + + `'activations'` : `List[torch.Tensor]` + A list of activations at each layer of the network, each of shape + `(batch_size, timesteps + 2, embedding_dim)` + `'mask'`: `torch.Tensor` + Shape `(batch_size, timesteps + 2)` long tensor with sequence mask. + + Note that the output tensors all include additional special begin and + end of sequence markers. + """ + if self._word_embedding is not None and word_inputs is not None: + try: + mask_without_bos_eos = (word_inputs > 0).long() + # The character cnn part is cached - just look it up. + embedded_inputs = self._word_embedding( + word_inputs) + # shape (batch_size, timesteps + 2, embedding_dim) + type_representation, mask = add_sentence_boundary_token_ids( + embedded_inputs, mask_without_bos_eos, self._bos_embedding, + self._eos_embedding + ) + except RuntimeError: + # Back off to running the character convolutions, + # as we might not have the words in the cache. + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + else: + token_embedding = self._token_embedder(inputs) + mask = token_embedding["mask"] + type_representation = token_embedding["token_embedding"] + lstm_outputs = self._elmo_lstm(type_representation, mask) + + # Prepare the output. The first layer is duplicated. 
+ # Because of minor differences in how masking is applied depending + # on whether the char cnn layers are cached, we'll be defensive and + # multiply by the mask here. It's not strictly necessary, as the + # mask passed on is correct, but the values in the padded areas + # of the char cnn representations can change. + output_tensors = [ + torch.cat([type_representation, type_representation], dim=-1) + * mask.float().unsqueeze(-1) + ] + for layer_activations in torch.chunk(lstm_outputs, + lstm_outputs.size(0), dim=0): + output_tensors.append(layer_activations.squeeze(0)) + + return {"activations": output_tensors, "mask": mask} + + def create_cached_cnn_embeddings(self, tokens: List[str]) -> None: + r"""Given a list of tokens, this method precomputes word representations + by running just the character convolutions and highway layers of elmo, + essentially creating uncontextual word vectors. On subsequent forward + passes, the word ids are looked up from an embedding, rather than being + computed on the fly via the CNN encoder. + + This function sets 3 attributes: + + _word_embedding : `torch.Tensor` + The word embedding for each word in the tokens passed to this + method. + _bos_embedding : `torch.Tensor` + The embedding for the BOS token. + _eos_embedding : `torch.Tensor` + The embedding for the EOS token. + + # Parameters + + tokens : `List[str]`, required. + A list of tokens to precompute character convolutions for. + """ + tokens = [ELMoCharacterMapper.bos_token, + ELMoCharacterMapper.eos_token] + tokens + timesteps = 32 + batch_size = 32 + chunked_tokens = lazy_groups_of(iter(tokens), timesteps) + + all_embeddings = [] + device = get_device_of(next(self.parameters())) + for batch in lazy_groups_of(chunked_tokens, batch_size): + # Shape (batch_size, timesteps, 50) + batched_tensor = batch_to_ids(batch) + # NOTE: This device check is for when a user calls this method + # having already placed the model on a device. If this is called in + # the constructor, it will probably happen on the CPU. This isn't + # too bad, because it's only a few convolutions and will likely + # be very fast. + if device >= 0: + batched_tensor = batched_tensor.cuda(device) + output = self._token_embedder(batched_tensor) + token_embedding = output["token_embedding"] + mask = output["mask"] + token_embedding, _ = remove_sentence_boundaries(token_embedding, + mask) + all_embeddings.append(token_embedding.view( + -1, token_embedding.size(-1))) + full_embedding = torch.cat(all_embeddings, 0) + + # We might have some trailing embeddings from padding in the batch, so + # we clip the embedding and lookup to the right size. + full_embedding = full_embedding[: len(tokens), :] + embedding_ = full_embedding[2: len(tokens), :] + vocab_size, embedding_dim = list(embedding_.size()) + + self._bos_embedding = full_embedding[0, :] + self._eos_embedding = full_embedding[1, :] + self._word_embedding = Embedding( # type: ignore + vocab_size, + embedding_dim, + weight=embedding_.data, + trainable=self._requires_grad, + padding_index=0, + ) + + +class _ElmoCharacterEncoder(torch.nn.Module): + r"""Compute context insensitive token representation using pretrained biLM. + + This embedder has input character ids of size + (batch_size, sequence_length, 50) + and returns (batch_size, sequence_length + 2, embedding_dim), where + embedding_dim is specified in the options file (typically 512). + + We add special entries at the beginning and end of each sequence + corresponding to and , the beginning and end of sentence tokens. 
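+    These boundary markers are the ``bos_token`` and ``eos_token`` defined
+    by :class:`~texar.torch.data.tokenizers.elmo_tokenizer_utils.ELMoCharacterMapper`.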
+ + Note: this is a lower level class useful for advanced usage. Most users + should use `ElmoTokenEmbedder` or `allennlp.modules.Elmo` instead. + + # Parameters + + options_file : `str` + ELMo JSON options file + weight_file : `str` + ELMo hdf5 weight file + requires_grad : `bool`, optional, (default = False). + If True, compute gradient of ELMo parameters for fine tuning. + + The relevant section of the options file is something like: + .. example-code:: + + .. code-block:: python + + {'char_cnn': { + 'activation': 'relu', + 'embedding': {'dim': 4}, + 'filters': [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]], + 'max_characters_per_token': 50, + 'n_characters': 262, + 'n_highway': 2 + } + } + """ + + def __init__(self, options_file: str, weight_file: str, + requires_grad: bool = False) -> None: + super().__init__() + + with open(options_file, "r") as fin: + self._options = json.load(fin) + self._weight_file = weight_file + + self.output_dim = self._options["lstm"]["projection_dim"] + self.requires_grad = requires_grad + + if weight_file is not None: + self._load_weights() + else: + # Do not load the weights + self._load_weights(False) + + # Cache the arrays for use in forward -- +1 due to masking. + self._beginning_of_sentence_characters = torch.from_numpy( + numpy.array( + ELMoCharacterMapper.beginning_of_sentence_characters) + 1 + ) + self._end_of_sentence_characters = torch.from_numpy( + numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1 + ) + + def get_output_dim(self): + return self.output_dim + + def forward(self, # type: ignore + inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + r"""Compute context insensitive token embeddings for ELMo + representations. + + # Parameters + + inputs : `torch.Tensor` + Shape `(batch_size, sequence_length, 50)` of character ids + representing the current batch. + + # Returns + + Dict with keys: + `'token_embedding'` : `torch.Tensor` + Shape `(batch_size, sequence_length + 2, embedding_dim)` tensor + with context insensitive token representations. + `'mask'`: `torch.Tensor` + Shape `(batch_size, sequence_length + 2)` long tensor with + sequence mask. 
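+
+        For example, with the ``elmo-small`` options (``projection_dim`` of
+        128), an input of shape `(2, 7, 50)` yields `token_embedding` of
+        shape `(2, 9, 128)` and `mask` of shape `(2, 9)`.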
+ """ + # Add BOS/EOS + mask = ((inputs > 0).long().sum(dim=-1) > 0).long() + character_ids_with_bos_eos, mask_with_bos_eos = \ + add_sentence_boundary_token_ids( + inputs, mask, self._beginning_of_sentence_characters, + self._end_of_sentence_characters) + + # the character id embedding + max_chars_per_token = \ + self._options["char_cnn"]["max_characters_per_token"] + # (batch_size * sequence_length, max_chars_per_token, embed_dim) + character_embedding = torch.nn.functional.embedding( + character_ids_with_bos_eos.view(-1, max_chars_per_token), + self._char_embedding_weights) + + # run convolutions + cnn_options = self._options["char_cnn"] + activation: Callable + if cnn_options["activation"] == "tanh": + activation = torch.tanh + elif cnn_options["activation"] == "relu": + activation = torch.nn.functional.relu + else: + raise ConfigurationError("Unknown activation") + + # (batch_size * sequence_length, embed_dim, max_chars_per_token) + character_embedding = torch.transpose(character_embedding, 1, 2) + convs = [] + for i in range(len(self._convolutions)): + conv = getattr(self, "char_conv_{}".format(i)) + convolved = conv(character_embedding) + # (batch_size * sequence_length, n_filters for this width) + convolved, _ = torch.max(convolved, dim=-1) + convolved = activation(convolved) + convs.append(convolved) + + # (batch_size * sequence_length, n_filters) + token_embedding = torch.cat(convs, dim=-1) + + # apply the highway layers (batch_size * sequence_length, n_filters) + token_embedding = self._highways(token_embedding) + + # final projection (batch_size * sequence_length, embedding_dim) + token_embedding = self._projection(token_embedding) + + # reshape to (batch_size, sequence_length, embedding_dim) + batch_size, sequence_length, _ = character_ids_with_bos_eos.size() + + return { + "mask": mask_with_bos_eos, + "token_embedding": token_embedding.view(batch_size, + sequence_length, -1), + } + + def _load_weights(self, load_weights=True): + self._load_char_embedding(load_weights) + self._load_cnn_weights(load_weights) + self._load_highway(load_weights) + self._load_projection(load_weights) + + def _load_char_embedding(self, load_weights): + + if load_weights: + with h5py.File(self._weight_file, "r") as fin: + char_embed_weights = fin["char_embed"][...] + + weights = numpy.zeros( + (char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]), + dtype="float32" + ) + weights[1:, :] = char_embed_weights + + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad + ) + else: + weights = numpy.zeros( + (self._options['char_cnn']['n_characters'], + self._options['char_cnn']['embedding']['dim']), + dtype="float32" + ) + self._char_embedding_weights = torch.nn.Parameter( + torch.FloatTensor(weights), requires_grad=self.requires_grad + ) + + def _load_cnn_weights(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + char_embed_dim = cnn_options["embedding"]["dim"] + + convolutions = [] + for i, (width, num) in enumerate(filters): + conv = torch.nn.Conv1d( + in_channels=char_embed_dim, out_channels=num, + kernel_size=width, bias=True + ) + + if load_weights: + # load the weights + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN"]["W_cnn_{}".format(i)][...] + bias = fin["CNN"]["b_cnn_{}".format(i)][...] 
+ + w_reshaped = numpy.transpose(weight.squeeze(axis=0), + axes=(2, 1, 0)) + if w_reshaped.shape != tuple(conv.weight.data.shape): + raise ValueError("Invalid weight file") + conv.weight.data.copy_(torch.FloatTensor(w_reshaped)) + conv.bias.data.copy_(torch.FloatTensor(bias)) + + conv.weight.requires_grad = self.requires_grad + conv.bias.requires_grad = self.requires_grad + + convolutions.append(conv) + self.add_module("char_conv_{}".format(i), conv) + + self._convolutions = convolutions + + def _load_highway(self, load_weights): + + # the highway layers have same dimensionality as the number of cnn + # filters + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + n_highway = cnn_options["n_highway"] + + # create the layers, and load the weights + self._highways = Highway(n_filters, n_highway, + activation=torch.nn.functional.relu) + + if load_weights: + for k in range(n_highway): + # The AllenNLP highway is one matrix multplication with + # concatenation of transform and carry weights. + with h5py.File(self._weight_file, "r") as fin: + # The weights are transposed due to multiplication order + # assumptions in tf vs pytorch (tf.matmul(X, W) vs + # pytorch.matmul(W, X)) + w_transform = numpy.transpose( + fin["CNN_high_{}".format(k)]["W_transform"][...]) + # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but + # tf is (1 - g) * x + g * f(x) + w_carry = -1.0 * numpy.transpose( + fin["CNN_high_{}".format(k)]["W_carry"][...]) + weight = numpy.concatenate([w_transform, w_carry], axis=0) + self._highways._layers[k].weight.data.copy_( + torch.FloatTensor(weight)) + self._highways._layers[k].weight.requires_grad = \ + self.requires_grad + + b_transform = \ + fin["CNN_high_{}".format(k)]["b_transform"][...] + b_carry = \ + -1.0 * fin["CNN_high_{}".format(k)]["b_carry"][...] + bias = numpy.concatenate([b_transform, b_carry], axis=0) + self._highways._layers[k].bias.data.copy_( + torch.FloatTensor(bias)) + self._highways._layers[k].bias.requires_grad = \ + self.requires_grad + + def _load_projection(self, load_weights): + cnn_options = self._options["char_cnn"] + filters = cnn_options["filters"] + n_filters = sum(f[1] for f in filters) + + self._projection = torch.nn.Linear(n_filters, self.output_dim, + bias=True) + + if load_weights: + with h5py.File(self._weight_file, "r") as fin: + weight = fin["CNN_proj"]["W_proj"][...] + bias = fin["CNN_proj"]["b_proj"][...] + self._projection.weight.data.copy_(torch.FloatTensor( + numpy.transpose(weight))) + self._projection.bias.data.copy_(torch.FloatTensor(bias)) + self._projection.weight.requires_grad = self.requires_grad + self._projection.bias.requires_grad = self.requires_grad + + +RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] +RnnStateStorage = Tuple[torch.Tensor, ...] + + +class _EncoderBase(torch.nn.Module): + r"""This abstract class serves as a base for the 3 `Encoder` abstractions + in AllenNLP. + - [`Seq2SeqEncoders`](./seq2seq_encoders/seq2seq_encoder.md) + - [`Seq2VecEncoders`](./seq2vec_encoders/seq2vec_encoder.md) + + Additionally, this class provides functionality for sorting sequences by + length so they can be consumed by Pytorch RNN classes, which require their + inputs to be sorted by length. Finally, it also provides optional + statefulness to all of it's subclasses by allowing the caching and + retrieving of the hidden states of RNNs. 
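+
+    When `stateful` is `True`, the final hidden states of each call to
+    `forward` are cached and re-used as the initial states for the next
+    call; `reset_states` clears them.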
+ """ + + def __init__(self, stateful: bool = False) -> None: + super().__init__() + self.stateful = stateful + self._states: Optional[RnnStateStorage] = None + + def sort_and_run_forward( + self, + module: Callable[ + [PackedSequence, Optional[RnnState]], + Tuple[Union[PackedSequence, torch.Tensor], RnnState], + ], + inputs: torch.Tensor, + mask: torch.Tensor, + hidden_state: Optional[RnnState] = None, + ): + r"""This function exists because Pytorch RNNs require that their inputs + be sorted before being passed as input. As all of our Seq2xxxEncoders + use this functionality, it is provided in a base class. This method can + be called on any module which takes as input a `PackedSequence` and + some `hidden_state`, which can either be a tuple of tensors or a tensor. + + As all of our Seq2xxxEncoders have different return types, we return + `sorted` outputs from the module, which is called directly. + Additionally, we return the indices into the batch dimension required + to restore the tensor to it's correct, unsorted order and the number of + valid batch elements (i.e the number of elements in the batch which are + not completely masked). This un-sorting and re-padding of the module + outputs is left to the subclasses because their outputs have different + types and handling them smoothly here is difficult. + + # Parameters + + module : `Callable[[PackedSequence, Optional[RnnState]], + Tuple[Union[PackedSequence, torch.Tensor], + RnnState]]`, required. + A function to run on the inputs. In most cases, this is a + `torch.nn.Module`. + inputs : `torch.Tensor`, required. + A tensor of shape `(batch_size, sequence_length, embedding_size)` + representing the inputs to the Encoder. + mask : `torch.Tensor`, required. + A tensor of shape `(batch_size, sequence_length)`, representing + masked and non-masked elements of the sequence for each element in + the batch. + hidden_state : `Optional[RnnState]`, (default = None). + A single tensor of shape (num_layers, batch_size, hidden_size) + representing the state of an RNN with or a tuple of tensors of + shapes (num_layers, batch_size, hidden_size) and + (num_layers, batch_size, memory_size), representing the hidden + state and memory state of an LSTM-like RNN. + + # Returns + + module_output : `Union[torch.Tensor, PackedSequence]`. + A Tensor or PackedSequence representing the output of the Pytorch + Module. The batch size dimension will be equal to `num_valid`, as + sequences of zero length are clipped off before the module is + called, as Pytorch cannot handle zero length sequences. + final_states : `Optional[RnnState]` + A Tensor representing the hidden state of the Pytorch Module. This + can either be a single tensor of shape (num_layers, num_valid, + hidden_size), for instance in the case of a GRU, or a tuple of + tensors, such as those required for an LSTM. + restoration_indices : `torch.LongTensor` + A tensor of shape `(batch_size,)`, describing the re-indexing + required to transform the outputs back to their original batch + order. + """ + # In some circumstances you may have sequences of zero length. + # `pack_padded_sequence` requires all sequence lengths to be > 0, so + # remove sequences of zero length before calling self._module, then + # fill with zeros. + + # First count how many sequences are empty. 
+ batch_size = mask.size(0) + num_valid = torch.sum(mask[:, 0]).int().item() + + sequence_lengths = get_lengths_from_binary_sequence_mask(mask) + ( + sorted_inputs, + sorted_sequence_lengths, + restoration_indices, + sorting_indices, + ) = sort_batch_by_length(inputs, sequence_lengths) + + # Now create a PackedSequence with only the non-empty, sorted sequences. + packed_sequence_input = pack_padded_sequence( + sorted_inputs[:num_valid, :, :], + sorted_sequence_lengths[:num_valid].data.tolist(), + batch_first=True, + ) + # Prepare the initial states. + if not self.stateful: + if hidden_state is None: + initial_states: Any = hidden_state + elif isinstance(hidden_state, tuple): + initial_states = [ + state.index_select( + 1, sorting_indices)[:, :num_valid, :].contiguous() + for state in hidden_state + ] + else: + initial_states = hidden_state.index_select(1, sorting_indices)[ + :, :num_valid, :].contiguous() + + else: + initial_states = self._get_initial_states(batch_size, num_valid, + sorting_indices) + + # Actually call the module on the sorted PackedSequence. + module_output, final_states = module(packed_sequence_input, + initial_states) + + return module_output, final_states, restoration_indices + + def _get_initial_states( + self, batch_size: int, num_valid: int, sorting_indices: torch.LongTensor + ) -> Optional[RnnState]: + r"""Returns an initial state for use in an RNN. Additionally, this + method handles the batch size changing across calls by mutating the + state to append initial states for new elements in the batch. Finally, + it also handles sorting the states with respect to the sequence lengths + of elements in the batch and removing rows which are completely padded. + Importantly, this `mutates` the state if the current batch size is + larger than when it was previously called. + + # Parameters + + batch_size : `int`, required. + The batch size can change size across calls to stateful RNNs, so we + need to know if we need to expand or shrink the states before + returning them. Expanded states will be set to zero. + num_valid : `int`, required. + The batch may contain completely padded sequences which get removed + before the sequence is passed through the encoder. We also need to + clip these off of the state too. + sorting_indices `torch.LongTensor`, required. + Pytorch RNNs take sequences sorted by length. When we return the + states to be used for a given call to `module.forward`, we need the + states to match up to the sorted sequences, so before returning + them, we sort the states using the same indices used to sort the + sequences. + + # Returns + + This method has a complex return type because it has to deal with the + first time it is called, when it has no state, and the fact that types + of RNN have heterogeneous states. + + If it is the first time the module has been called, it returns `None`, + regardless of the type of the `Module`. + + Otherwise, for LSTMs, it returns a tuple of `torch.Tensors` with shape + `(num_layers, num_valid, state_size)` and `(num_layers, num_valid, + memory_size)` respectively, or for GRUs, it returns a single + `torch.Tensor` of shape `(num_layers, num_valid, state_size)`. + """ + # We don't know the state sizes the first time calling forward, + # so we let the module define what it's initial hidden state looks like. + if self._states is None: + return None + + # Otherwise, we have some previous states. + if batch_size > self._states[0].size(1): + # This batch is larger than the all previous states. + # If so, resize the states. 
+ num_states_to_concat = batch_size - self._states[0].size(1) + resized_states = [] + # state has shape (num_layers, batch_size, hidden_size) + for state in self._states: + # This _must_ be inside the loop because some + # RNNs have states with different last dimension sizes. + zeros = state.new_zeros(state.size(0), num_states_to_concat, + state.size(2)) + resized_states.append(torch.cat([state, zeros], 1)) + self._states = tuple(resized_states) + correctly_shaped_states = self._states + + elif batch_size < self._states[0].size(1): + # This batch is smaller than the previous one. + correctly_shaped_states = tuple(state[:, :batch_size, :] for state + in self._states) + else: + correctly_shaped_states = self._states + + # At this point, our states are of shape (num_layers, batch_size, + # hidden_size). However, the encoder uses sorted sequences and + # additionally removes elements of the batch which are fully padded. + # We need the states to match up to these sorted and filtered + # sequences, so we do that in the next two blocks before returning the + # state/s. + if len(self._states) == 1: + # GRUs only have a single state. This `unpacks` it from the + # tuple and returns the tensor directly. + correctly_shaped_state = correctly_shaped_states[0] + sorted_state = correctly_shaped_state.index_select( + 1, sorting_indices) + return sorted_state[:, :num_valid, :].contiguous() + else: + # LSTMs have a state tuple of (state, memory). + sorted_states = [ + state.index_select(1, sorting_indices) for state in + correctly_shaped_states + ] + return tuple(state[:, :num_valid, :].contiguous() # type: ignore + for state in sorted_states) + + def _update_states(self, final_states: RnnStateStorage, + restoration_indices: torch.LongTensor) -> None: + r"""After the RNN has run forward, the states need to be updated. + This method just sets the state to the updated new state, performing + several pieces of book-keeping along the way - namely, unsorting the + states and ensuring that the states of completely padded sequences are + not updated. Finally, it also detaches the state variable from the + computational graph, such that the graph can be garbage collected after + each batch iteration. + + # Parameters + + final_states : `RnnStateStorage`, required. + The hidden states returned as output from the RNN. + restoration_indices : `torch.LongTensor`, required. + The indices that invert the sorting used in `sort_and_run_forward` + to order the states with respect to the lengths of the sequences in + the batch. + """ + # TODO(Mark): seems weird to sort here, but append zeros in the + # subclasses. + # which way around is best? + new_unsorted_states = [state.index_select(1, restoration_indices) for + state in final_states] + + if self._states is None: + # We don't already have states, so just set the + # ones we receive to be the current state. + self._states = tuple(state.data for state in new_unsorted_states) + else: + # Now we've sorted the states back so that they correspond to the + # original indices, we need to figure out what states we need to + # update, because if we didn't use a state for a particular row, + # we want to preserve its state. Thankfully, the rows which are + # all zero in the state correspond exactly to those which aren't + # used, so we create masks of shape (new_batch_size,), denoting + # which states were used in the RNN computation. 
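+            # For example, if row 2 of a new (unsorted) state is all zeros
+            # because that sequence was completely masked, its entry in
+            # `used_new_rows_mask` below is 0 and the previously stored state
+            # for that row is kept unchanged.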
+ current_state_batch_size = self._states[0].size(1) + new_state_batch_size = final_states[0].size(1) + # Masks for the unused states of shape (1, new_batch_size, 1) + used_new_rows_mask = [ + (state[0, :, :].sum(-1) != 0.0).float().view( + 1, new_state_batch_size, 1) + for state in new_unsorted_states + ] + new_states = [] + if current_state_batch_size > new_state_batch_size: + # The new state is smaller than the old one, + # so just update the indices which we used. + for old_state, new_state, used_mask in zip( + self._states, new_unsorted_states, used_new_rows_mask + ): + # zero out all rows in the previous state + # which _were_ used in the current state. + masked_old_state = \ + old_state[:, :new_state_batch_size, :] * (1 - used_mask) + # The old state is larger, so update the relevant parts of + # it. + old_state[:, :new_state_batch_size, :] = \ + new_state + masked_old_state + new_states.append(old_state.detach()) + else: + # The states are the same size, so we just have to + # deal with the possibility that some rows weren't used. + new_states = [] + for old_state, new_state, used_mask in zip( + self._states, new_unsorted_states, used_new_rows_mask + ): + # zero out all rows which _were_ used in the current state. + masked_old_state = old_state * (1 - used_mask) + # The old state is larger, so update the relevant parts of + # it. + new_state += masked_old_state + new_states.append(new_state.detach()) + + # It looks like there should be another case handled here - when + # the current_state_batch_size < new_state_batch_size. However, + # this never happens, because the states themeselves are mutated + # by appending zeros when calling _get_inital_states, meaning that + # the new states are either of equal size, or smaller, in the case + # that there are some unused elements (zero-length) for the RNN + # computation. + self._states = tuple(new_states) + + def reset_states(self, mask: Optional[torch.Tensor] = None) -> None: + r"""Resets the internal states of a stateful encoder. + + # Parameters + + mask : `torch.Tensor`, optional. + A tensor of shape `(batch_size,)` indicating which states should + be reset. If not provided, all states will be reset. + """ + if mask is None: + self._states = None + else: + # state has shape (num_layers, batch_size, hidden_size). We reshape + # mask to have shape (1, batch_size, 1) so that operations + # broadcast properly. + mask_batch_size = mask.size(0) + mask = mask.float().view(1, mask_batch_size, 1) + new_states = [] + assert self._states is not None + for old_state in self._states: + old_state_batch_size = old_state.size(1) + if old_state_batch_size != mask_batch_size: + raise ValueError( + f"Trying to reset states using mask with incorrect " + f"batch size. " + f"Expected batch size: {old_state_batch_size}. " + f"Provided batch size: {mask_batch_size}." + ) + new_state = (1 - mask) * old_state + new_states.append(new_state.detach()) + self._states = tuple(new_states) + + +class ElmoLstm(_EncoderBase): + r"""A stacked, bidirectional LSTM which uses + [`LstmCellWithProjection`'s](./lstm_cell_with_projection.md) + with highway layers between the inputs to layers. + The inputs to the forward and backward directions are independent - + forward and backward states are not concatenated between layers. + + Additionally, this LSTM maintains its `own` state, which is updated every + time `forward` is called. 
It is dynamically resized for different batch + sizes and is designed for use with non-continuous inputs (i.e inputs which + aren't formatted as a stream, such as text used for a language modeling + task, which is how stateful RNNs are typically used). + This is non-standard, but can be thought of as having an "end of sentence" + state, which is carried across different sentences. + + # Parameters + + input_size : `int`, required + The dimension of the inputs to the LSTM. + hidden_size : `int`, required + The dimension of the outputs of the LSTM. + cell_size : `int`, required. + The dimension of the memory cell of the `LstmCellWithProjection`. + num_layers : `int`, required + The number of bidirectional LSTMs to use. + requires_grad : `bool`, optional + If True, compute gradient of ELMo parameters for fine tuning. + recurrent_dropout_probability : `float`, optional (default = 0.0) + The dropout probability to be used in a dropout scheme as stated in + [A Theoretically Grounded Application of Dropout in Recurrent Neural + Networks](https://arxiv.org/abs/1512.05287). + state_projection_clip_value : `float`, optional, (default = None) + The magnitude with which to clip the hidden_state after projecting it. + memory_cell_clip_value : `float`, optional, (default = None) + The magnitude with which to clip the memory cell. + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + cell_size: int, + num_layers: int, + requires_grad: bool = False, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None, + ) -> None: + super().__init__(stateful=True) + + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.cell_size = cell_size + self.requires_grad = requires_grad + + forward_layers = [] + backward_layers = [] + + lstm_input_size = input_size + go_forward = True + for layer_index in range(num_layers): + forward_layer = LstmCellWithProjection( + lstm_input_size, + hidden_size, + cell_size, + go_forward, + recurrent_dropout_probability, + memory_cell_clip_value, + state_projection_clip_value, + ) + backward_layer = LstmCellWithProjection( + lstm_input_size, + hidden_size, + cell_size, + not go_forward, + recurrent_dropout_probability, + memory_cell_clip_value, + state_projection_clip_value, + ) + lstm_input_size = hidden_size + + self.add_module("forward_layer_{}".format(layer_index), + forward_layer) + self.add_module("backward_layer_{}".format(layer_index), + backward_layer) + forward_layers.append(forward_layer) + backward_layers.append(backward_layer) + self.forward_layers = forward_layers + self.backward_layers = backward_layers + + def forward(self, inputs: torch.Tensor, # type: ignore + mask: torch.LongTensor) -> torch.Tensor: + r"""Encodes the inputs. + + # Parameters + + inputs : `torch.Tensor`, required. + A Tensor of shape `(batch_size, sequence_length, hidden_size)`. + mask : `torch.LongTensor`, required. + A binary mask of shape `(batch_size, sequence_length)` representing + the non-padded elements in each sequence in the batch. + + # Returns + + A `torch.Tensor` of shape (num_layers, batch_size, sequence_length, + hidden_size), where the num_layers dimension represents the LSTM output + from that layer. 
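+
+        For illustration, a minimal sketch with made-up sizes (the last
+        dimension of the output is the concatenation of the forward and
+        backward directions, i.e. `2 * hidden_size`):
+
+            lstm = ElmoLstm(input_size=3, hidden_size=4, cell_size=5,
+                            num_layers=2)
+            inputs = torch.rand(2, 7, 3)
+            mask = torch.ones(2, 7, dtype=torch.long)
+            output = lstm(inputs, mask)  # shape: (2, 2, 7, 8)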
+ """ + batch_size, total_sequence_length = mask.size() + stacked_sequence_output, final_states, restoration_indices = \ + self.sort_and_run_forward(self._lstm_forward, inputs, mask) + + num_layers, num_valid, returned_timesteps, encoder_dim = \ + stacked_sequence_output.size() + # Add back invalid rows which were removed in the call to + # sort_and_run_forward. + if num_valid < batch_size: + zeros = stacked_sequence_output.new_zeros( + num_layers, batch_size - num_valid, returned_timesteps, + encoder_dim + ) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 1) + + # The states also need to have invalid rows added back. + new_states = [] + for state in final_states: + state_dim = state.size(-1) + zeros = state.new_zeros(num_layers, batch_size - num_valid, + state_dim) + new_states.append(torch.cat([state, zeros], 1)) + final_states = new_states + + # It's possible to need to pass sequences which are padded to longer + # than the max length of the sequence to a Seq2StackEncoder. However, + # packing and unpacking the sequences mean that the returned tensor + # won't include these dimensions, because the RNN did not need to + # process them. We add them back on in the form of zeros here. + sequence_length_difference = total_sequence_length - returned_timesteps + if sequence_length_difference > 0: + zeros = stacked_sequence_output.new_zeros( + num_layers, + batch_size, + sequence_length_difference, + stacked_sequence_output[0].size(-1), + ) + stacked_sequence_output = torch.cat( + [stacked_sequence_output, zeros], 2) + + self._update_states(final_states, restoration_indices) + + # Restore the original indices and return the sequence. + # Has shape (num_layers, batch_size, sequence_length, hidden_size) + return stacked_sequence_output.index_select(1, restoration_indices) + + def _lstm_forward( + self, + inputs: PackedSequence, + initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Encodes the inputs. + + # Parameters + + inputs : `PackedSequence`, required. + A batch first `PackedSequence` to run the stacked LSTM over. + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, + (default = None) + A tuple (state, memory) representing the initial hidden state and + memory of the LSTM, with shape + (num_layers, batch_size, 2 * hidden_size) and + (num_layers, batch_size, 2 * cell_size) respectively. + + # Returns + + output_sequence : `torch.FloatTensor` + The encoded sequence of shape + (num_layers, batch_size, sequence_length, hidden_size) + final_states : `Tuple[torch.FloatTensor, torch.FloatTensor]` + The per-layer final (state, memory) states of the LSTM, with shape + (num_layers, batch_size, 2 * hidden_size) and + (num_layers, batch_size, 2 * cell_size) + respectively. The last dimension is duplicated because it + contains the state/memory for both the forward and backward layers. + """ + if initial_state is None: + hidden_states: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = \ + [None] * len(self.forward_layers) + elif initial_state[0].size()[0] != len(self.forward_layers): + raise ConfigurationError( + "Initial states were passed to forward() but the number of " + "initial states does not match the number of layers." 
+ ) + else: + hidden_states = list(zip(initial_state[0].split(1, 0), + initial_state[1].split(1, 0))) + + inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) + forward_output_sequence = inputs + backward_output_sequence = inputs + + final_states = [] + sequence_outputs = [] + for layer_index, state in enumerate(hidden_states): + forward_layer = getattr(self, "forward_layer_{}".format( + layer_index)) + backward_layer = getattr(self, "backward_layer_{}".format( + layer_index)) + + forward_cache = forward_output_sequence + backward_cache = backward_output_sequence + + if state is not None: + forward_hidden_state, backward_hidden_state = state[0].split( + self.hidden_size, 2) + forward_memory_state, backward_memory_state = state[1].split( + self.cell_size, 2) + forward_state = (forward_hidden_state, forward_memory_state) + backward_state = (backward_hidden_state, backward_memory_state) + else: + forward_state = None # type: ignore + backward_state = None # type: ignore + + forward_output_sequence, forward_state = forward_layer( + forward_output_sequence, batch_lengths, forward_state + ) + backward_output_sequence, backward_state = backward_layer( + backward_output_sequence, batch_lengths, backward_state + ) + # Skip connections, just adding the input to the output. + if layer_index != 0: + forward_output_sequence += forward_cache + backward_output_sequence += backward_cache + + sequence_outputs.append( + torch.cat([forward_output_sequence, backward_output_sequence], + -1) + ) + # Append the state tuples in a list, so that we can return + # the final states for all the layers. + final_states.append( + ( + torch.cat([forward_state[0], backward_state[0]], -1), + torch.cat([forward_state[1], backward_state[1]], -1), + ) + ) + + stacked_sequence_outputs: torch.FloatTensor = torch.stack( + sequence_outputs) + # Stack the hidden state and memory for each layer into 2 tensors of + # shape (num_layers, batch_size, hidden_size) and + # (num_layers, batch_size, cell_size) respectively. + final_hidden_states, final_memory_states = zip(*final_states) + final_state_tuple: Tuple[torch.FloatTensor, torch.FloatTensor] = ( + torch.cat(final_hidden_states, 0), + torch.cat(final_memory_states, 0), + ) + return stacked_sequence_outputs, final_state_tuple + + def load_weights(self, weight_file: str) -> None: + r"""Load the pre-trained weights from the file. + """ + requires_grad = self.requires_grad + + with h5py.File(weight_file, "r") as fin: + for i_layer, lstms in enumerate(zip(self.forward_layers, + self.backward_layers)): + for j_direction, lstm in enumerate(lstms): + # lstm is an instance of LSTMCellWithProjection + cell_size = lstm.cell_size + + dataset = fin["RNN_%s" % j_direction]["RNN"][ + "MultiRNNCell"][ + "Cell%s" % i_layer + ]["LSTMCell"] + + # tensorflow packs together both W and U matrices into one + # matrix, but pytorch maintains individual matrices. In + # addition, tensorflow packs the gates as input, memory, + # forget, output but pytorch uses input, forget, memory, + # output. So we need to modify the weights. 
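+                    # For example, with cell_size = 2 the stacked gate rows
+                    # are laid out as [input, memory, forget, output]
+                    # (rows 0-1, 2-3, 4-5, 6-7) in the tensorflow weights and
+                    # must become [input, forget, memory, output] here, so the
+                    # (1 * cell_size):(2 * cell_size) and
+                    # (2 * cell_size):(3 * cell_size) blocks are swapped below.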
+ tf_weights = numpy.transpose(dataset["W_0"][...]) + torch_weights = tf_weights.copy() + + # split the W from U matrices + input_size = lstm.input_size + input_weights = torch_weights[:, :input_size] + recurrent_weights = torch_weights[:, input_size:] + tf_input_weights = tf_weights[:, :input_size] + tf_recurrent_weights = tf_weights[:, input_size:] + + # handle the different gate order convention + for torch_w, tf_w in [ + [input_weights, tf_input_weights], + [recurrent_weights, tf_recurrent_weights], + ]: + torch_w[(1 * cell_size): (2 * cell_size), :] = tf_w[ + (2 * cell_size): (3 * cell_size), : + ] + torch_w[(2 * cell_size): (3 * cell_size), :] = tf_w[ + (1 * cell_size): (2 * cell_size), : + ] + + lstm.input_linearity.weight.data.copy_(torch.FloatTensor( + input_weights)) + lstm.state_linearity.weight.data.copy_(torch.FloatTensor( + recurrent_weights)) + lstm.input_linearity.weight.requires_grad = requires_grad + lstm.state_linearity.weight.requires_grad = requires_grad + + # the bias weights + tf_bias = dataset["B"][...] + # tensorflow adds 1.0 to forget gate bias instead of + # modifying the parameters... + tf_bias[(2 * cell_size): (3 * cell_size)] += 1 + torch_bias = tf_bias.copy() + torch_bias[(1 * cell_size): (2 * cell_size)] = tf_bias[ + (2 * cell_size): (3 * cell_size) + ] + torch_bias[(2 * cell_size): (3 * cell_size)] = tf_bias[ + (1 * cell_size): (2 * cell_size) + ] + lstm.state_linearity.bias.data.copy_(torch.FloatTensor( + torch_bias)) + lstm.state_linearity.bias.requires_grad = requires_grad + + # the projection weights + proj_weights = numpy.transpose(dataset["W_P_0"][...]) + lstm.state_projection.weight.data.copy_(torch.FloatTensor( + proj_weights)) + lstm.state_projection.weight.requires_grad = requires_grad + + +class LstmCellWithProjection(torch.nn.Module): + r"""An LSTM with Recurrent Dropout and a projected and clipped hidden state + and memory. Note: this implementation is slower than the native Pytorch + LSTM because it cannot make use of CUDNN optimizations for stacked RNNs due + to and variational dropout and the custom nature of the cell state. + + # Parameters + + input_size : `int`, required. + The dimension of the inputs to the LSTM. + hidden_size : `int`, required. + The dimension of the outputs of the LSTM. + cell_size : `int`, required. + The dimension of the memory cell used for the LSTM. + go_forward : `bool`, optional (default = True) + The direction in which the LSTM is applied to the sequence. + Forwards by default, or backwards if False. + recurrent_dropout_probability : `float`, optional (default = 0.0) + The dropout probability to be used in a dropout scheme as stated in + [A Theoretically Grounded Application of Dropout in Recurrent Neural + Networks] (https://arxiv.org/abs/1512.05287). Implementation wise, + this simply applies a fixed dropout mask per sequence to the recurrent + connection of the LSTM. + state_projection_clip_value : `float`, optional, (default = None) + The magnitude with which to clip the hidden_state after projecting it. + memory_cell_clip_value : `float`, optional, (default = None) + The magnitude with which to clip the memory cell. + + # Returns + + output_accumulator : `torch.FloatTensor` + The outputs of the LSTM for each timestep. A tensor of shape + (batch_size, max_timesteps, hidden_size) where for a given batch + element, all outputs past the sequence length for that batch are + zero tensors. 
+ final_state : `Tuple[torch.FloatTensor, torch.FloatTensor]` + The final (state, memory) states of the LSTM, with shape + (1, batch_size, hidden_size) and (1, batch_size, cell_size) + respectively. The first dimension is 1 in order to match the Pytorch + API for returning stacked LSTM states. + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + cell_size: int, + go_forward: bool = True, + recurrent_dropout_probability: float = 0.0, + memory_cell_clip_value: Optional[float] = None, + state_projection_clip_value: Optional[float] = None, + ) -> None: + super().__init__() + # Required to be wrapped with a `PytorchSeq2SeqWrapper`. + self.input_size = input_size + self.hidden_size = hidden_size + self.cell_size = cell_size + + self.go_forward = go_forward + self.state_projection_clip_value = state_projection_clip_value + self.memory_cell_clip_value = memory_cell_clip_value + self.recurrent_dropout_probability = recurrent_dropout_probability + + # We do the projections for all the gates all at once. + self.input_linearity = torch.nn.Linear( + input_size, 4 * cell_size, bias=False) + self.state_linearity = torch.nn.Linear( + hidden_size, 4 * cell_size, bias=True) + + # Additional projection matrix for making the hidden state smaller. + self.state_projection = torch.nn.Linear( + cell_size, hidden_size, bias=False) + self.reset_parameters() + + def reset_parameters(self): + # Use sensible default initializations for parameters. + block_orthogonal(self.input_linearity.weight.data, + [self.cell_size, self.input_size]) + block_orthogonal(self.state_linearity.weight.data, + [self.cell_size, self.hidden_size]) + + self.state_linearity.bias.data.fill_(0.0) + # Initialize forget gate biases to 1.0 as per An Empirical + # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015). + self.state_linearity.bias.data[self.cell_size: + 2 * self.cell_size].fill_(1.0) + + def forward( # type: ignore + self, + inputs: torch.FloatTensor, + batch_lengths: List[int], + initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ): + r"""Process the inputs. + + # Parameters + + inputs : `torch.FloatTensor`, required. + A tensor of shape (batch_size, num_timesteps, input_size) + to apply the LSTM over. + batch_lengths : `List[int]`, required. + A list of length batch_size containing the lengths of the sequences + in batch. + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, + (default = None) + A tuple (state, memory) representing the initial hidden state and + memory of the LSTM. The `state` has shape (1, batch_size, + hidden_size) and the `memory` has shape (1, batch_size, cell_size). + + # Returns + + output_accumulator : `torch.FloatTensor` + The outputs of the LSTM for each timestep. A tensor of shape + (batch_size, max_timesteps, hidden_size) where for a given batch + element, all outputs past the sequence length for that batch are + zero tensors. + final_state : `Tuple[`torch.FloatTensor, torch.FloatTensor]` + A tuple (state, memory) representing the initial hidden state and + memory of the LSTM. The `state` has shape (1, batch_size, + hidden_size) and the `memory` has shape (1, batch_size, cell_size). 
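+
+        For illustration, a minimal sketch with made-up sizes (the sequence
+        lengths must be sorted in decreasing order):
+
+            cell = LstmCellWithProjection(input_size=3, hidden_size=4,
+                                          cell_size=5)
+            inputs = torch.rand(2, 6, 3)
+            output, (state, memory) = cell(inputs, batch_lengths=[6, 5])
+            # output: (2, 6, 4), state: (1, 2, 4), memory: (1, 2, 5)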
+ """ + batch_size = inputs.size()[0] + total_timesteps = inputs.size()[1] + + output_accumulator = inputs.new_zeros(batch_size, total_timesteps, + self.hidden_size) + + if initial_state is None: + full_batch_previous_memory = inputs.new_zeros(batch_size, + self.cell_size) + full_batch_previous_state = inputs.new_zeros(batch_size, + self.hidden_size) + else: + full_batch_previous_state = initial_state[0].squeeze(0) + full_batch_previous_memory = initial_state[1].squeeze(0) + + current_length_index = batch_size - 1 if self.go_forward else 0 + if self.recurrent_dropout_probability > 0.0 and self.training: + dropout_mask = get_dropout_mask( + self.recurrent_dropout_probability, full_batch_previous_state + ) + else: + dropout_mask = None + + for timestep in range(total_timesteps): + # The index depends on which end we start. + index = timestep if self.go_forward else \ + total_timesteps - timestep - 1 + + # What we are doing here is finding the index into the batch + # dimension which we need to use for this timestep, because the + # sequences have variable length, so once the index is greater than + # the length of this particular batch sequence, we no longer need + # to do the computation for this sequence. The key thing to + # recognise here is that the batch inputs must be _ordered_ by + # length from longest (first in batch) to shortest (last) so + # initially, we are going forwards with every sequence and as we + # pass the index at which the shortest elements of the batch finish, + # we stop picking them up for the computation. + if self.go_forward: + while batch_lengths[current_length_index] <= index: + current_length_index -= 1 + # If we're going backwards, we are _picking up_ more indices. + else: + # First conditional: Are we already at the maximum number of + # elements in the batch? + # Second conditional: Does the next shortest sequence beyond + # the current batch index require computation use this timestep? + while ( + current_length_index < (len(batch_lengths) - 1) + and batch_lengths[current_length_index + 1] > index + ): + current_length_index += 1 + + # Actually get the slices of the batch which we + # need for the computation at this timestep. + # shape (batch_size, cell_size) + previous_memory = \ + full_batch_previous_memory[0: current_length_index + 1].clone() + # Shape (batch_size, hidden_size) + previous_state = \ + full_batch_previous_state[0: current_length_index + 1].clone() + # Shape (batch_size, input_size) + timestep_input = inputs[0: current_length_index + 1, index] + + # Do the projections for all the gates all at once. + # Both have shape (batch_size, 4 * cell_size) + projected_input = self.input_linearity(timestep_input) + projected_state = self.state_linearity(previous_state) + + # Main LSTM equations using relevant chunks of the big linear + # projections of the hidden state and inputs. 
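+            # In symbols (the bias terms come from `state_linearity` only,
+            # since `input_linearity` has no bias):
+            #     i_t = sigmoid(W_i x_t + U_i h_{t-1} + b_i)
+            #     f_t = sigmoid(W_f x_t + U_f h_{t-1} + b_f)
+            #     g_t = tanh(W_g x_t + U_g h_{t-1} + b_g)
+            #     o_t = sigmoid(W_o x_t + U_o h_{t-1} + b_o)
+            #     c_t = i_t * g_t + f_t * c_{t-1}
+            #     h_t = W_proj (o_t * tanh(c_t))
+            # with c_t and h_t optionally clipped afterwards.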
+ input_gate = torch.sigmoid( + projected_input[:, (0 * self.cell_size): (1 * self.cell_size)] + + projected_state[:, (0 * self.cell_size): (1 * self.cell_size)] + ) + forget_gate = torch.sigmoid( + projected_input[:, (1 * self.cell_size): (2 * self.cell_size)] + + projected_state[:, (1 * self.cell_size): (2 * self.cell_size)] + ) + memory_init = torch.tanh( + projected_input[:, (2 * self.cell_size): (3 * self.cell_size)] + + projected_state[:, (2 * self.cell_size): (3 * self.cell_size)] + ) + output_gate = torch.sigmoid( + projected_input[:, (3 * self.cell_size): (4 * self.cell_size)] + + projected_state[:, (3 * self.cell_size): (4 * self.cell_size)] + ) + memory = input_gate * memory_init + forget_gate * previous_memory + + # Here is the non-standard part of this LSTM cell; first, we clip + # the memory cell, then we project the output of the timestep to a + # smaller size and again clip it. + + if self.memory_cell_clip_value: + + memory = torch.clamp( + memory, -self.memory_cell_clip_value, + self.memory_cell_clip_value + ) + + # shape (current_length_index, cell_size) + pre_projection_timestep_output = output_gate * torch.tanh(memory) + + # shape (current_length_index, hidden_size) + timestep_output = self.state_projection( + pre_projection_timestep_output) + if self.state_projection_clip_value: + + timestep_output = torch.clamp( + timestep_output, + -self.state_projection_clip_value, + self.state_projection_clip_value, + ) + + # Only do dropout if the dropout prob is > 0.0 and we are in + # training mode. + if dropout_mask is not None: + timestep_output = \ + timestep_output * dropout_mask[0: current_length_index + 1] + + # We've been doing computation with less than the full batch, so + # here we create a new variable for the the whole batch at this + # timestep and insert the result for the relevant elements of the + # batch into it. + full_batch_previous_memory = full_batch_previous_memory.clone() + full_batch_previous_state = full_batch_previous_state.clone() + full_batch_previous_memory[0: current_length_index + 1] = memory + full_batch_previous_state[0: current_length_index + 1] = \ + timestep_output + output_accumulator[0: current_length_index + 1, index] = \ + timestep_output + + # Mimic the pytorch API by returning state in the following shape: + # (num_layers * num_directions, batch_size, ...). As this + # LSTM cell cannot be stacked, the first dimension here is just 1. + final_state = ( + full_batch_previous_state.unsqueeze(0), + full_batch_previous_memory.unsqueeze(0), + ) + + return output_accumulator, final_state + + +class Highway(torch.nn.Module): + r"""A [Highway layer](https://arxiv.org/abs/1505.00387) does a gated + combination of a linear transformation and a non-linear transformation of + its input. :math:`y = g * x + (1 - g) * f(A(x))`, + where :math:`A` is a linear transformation, :math:`f` is an element-wise + non-linearity, and :math:`g` is an element-wise gate, computed + as :math:`sigmoid(B(x))`. + + This module will apply a fixed number of highway layers to its input, + returning the final result. + + # Parameters + + input_dim : `int`, required + The dimensionality of :math:`x`. We assume the input has shape + `(batch_size, ..., input_dim)`. + num_layers : `int`, optional (default=`1`) + The number of highway layers to apply to the input. + activation : `Callable[[torch.Tensor], torch.Tensor]`, optional + (default=`torch.nn.functional.relu`) + The non-linearity to use in the highway layers. 
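+
+    For illustration, a minimal sketch with made-up sizes:
+
+        highway = Highway(input_dim=4, num_layers=2)
+        outputs = highway(torch.rand(3, 4))  # shape: (3, 4)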
+ """ + + def __init__(self, input_dim: int, num_layers: int = 1, + activation: Callable[[torch.Tensor], torch.Tensor] = + torch.nn.functional.relu,) -> None: + super().__init__() + self._input_dim = input_dim + self._layers = torch.nn.ModuleList( + [torch.nn.Linear(input_dim, input_dim * 2) + for _ in range(num_layers)] + ) + self._activation = activation + for layer in self._layers: + # We should bias the highway layer to just carry its input forward. + # We do that by setting the bias on `B(x)` to be positive, because + # that means `g` will be biased to be high, so we will carry the + # input forward. The bias on `B(x)` is the second half of the + # bias vector in each Linear layer. + layer.bias[input_dim:].data.fill_(1) # type: ignore + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: # type: ignore + current_input = inputs + for layer in self._layers: + projected_input = layer(current_input) + linear_part = current_input + # NOTE: if you modify this, think about whether you should modify + # the initialization above, too. + nonlinear_part, gate = projected_input.chunk(2, dim=-1) + nonlinear_part = self._activation(nonlinear_part) + gate = torch.sigmoid(gate) + current_input = gate * linear_part + (1 - gate) * nonlinear_part + return current_input + + +class Embedding(torch.nn.Module): + r"""A more featureful embedding module than the default in Pytorch. Adds + the ability to: + + 1. embed higher-order inputs + 2. pre-specify the weight matrix + 3. use a non-trainable embedding + 4. project the resultant embeddings to some other dimension (which only + makes sense with non-trainable embeddings). + 5. build all of this easily `from_params` + + Note that if you are using our data API and are trying to embed a + [`TextField`](../../data/fields/text_field.md), you should use a + [`TextFieldEmbedder`](../text_field_embedders/text_field_embedder.md) + instead of using this directly. + + # Parameters + + num_embeddings : `int` + Size of the dictionary of embeddings (vocabulary size). + embedding_dim : `int` + The size of each embedding vector. + projection_dim : `int`, (optional, default=None) + If given, we add a projection layer after the embedding layer. This + really only makes sense if `trainable` is `False`. + weight : `torch.FloatTensor`, (optional, default=None) + A pre-initialised weight matrix for the embedding lookup, allowing the + use of pretrained vectors. + padding_index : `int`, (optional, default=None) + If given, pads the output with zeros whenever it encounters the index. + trainable : `bool`, (optional, default=True) + Whether or not to optimize the embedding parameters. + max_norm : `float`, (optional, default=None) + If given, will renormalize the embeddings to always have a norm lesser + than this + norm_type : `float`, (optional, default=2) + The p of the p-norm to compute for the max_norm option + scale_grad_by_freq : `bool`, (optional, default=False) + If given, this will scale gradients by the frequency of the words in + the mini-batch. + sparse : `bool`, (optional, default=False) + Whether or not the Pytorch backend should use a sparse representation + of the embedding weight. + vocab_namespace : `str`, (optional, default=None) + In case of fine-tuning/transfer learning, the model's embedding matrix + needs to be extended according to the size of extended-vocabulary. To + be able to know how much to extend the embedding-matrix, it's necessary + to know which vocab_namspace was used to construct it in the original + training. 
We store vocab_namespace used during the original training as + an attribute, so that it can be retrieved during fine-tuning. + pretrained_file : `str`, (optional, default=None) + Used to keep track of what is the source of the weights and loading + more embeddings at test time. **It does not load the weights from this + pretrained_file.** For that purpose, use `Embedding.from_params`. + + # Returns + + An Embedding module. + """ + + default_implementation = "embedding" + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + projection_dim: Optional[int] = None, + weight: Optional[torch.FloatTensor] = None, + padding_index: Optional[int] = None, + trainable: bool = True, + max_norm: Optional[float] = None, + norm_type: float = 2.0, + scale_grad_by_freq: bool = False, + sparse: bool = False, + vocab_namespace: Optional[str] = None, + pretrained_file: Optional[str] = None, + ) -> None: + super().__init__() + self.num_embeddings = num_embeddings + self.padding_index = padding_index + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + self._vocab_namespace = vocab_namespace + self._pretrained_file = pretrained_file + + self.output_dim = projection_dim or embedding_dim + + if weight is None: + weight = torch.FloatTensor(num_embeddings, embedding_dim) + self.weight = torch.nn.Parameter(weight, requires_grad=trainable) + torch.nn.init.xavier_uniform_(self.weight) + else: + if weight.size() != (num_embeddings, embedding_dim): + raise ConfigurationError( + "A weight matrix was passed with contradictory embedding " + "shapes." + ) + self.weight = torch.nn.Parameter(weight, + requires_grad=trainable) + + if self.padding_index is not None: + self.weight.data[self.padding_index].fill_(0) + + if projection_dim: + self._projection = torch.nn.Linear(embedding_dim, projection_dim) + else: + self._projection = None # type: ignore + + def forward(self, tokens: torch.Tensor) -> torch.Tensor: # type: ignore + # tokens may have extra dimensions + # (batch_size, d1, ..., dn, sequence_length), + # but embedding expects (batch_size, sequence_length), so pass tokens to + # util.combine_initial_dims (which is a no-op if there are no extra + # dimensions). Remember the original size. + original_size = tokens.size() + tokens = combine_initial_dims(tokens) + + embedded = embedding( + tokens, + self.weight, + padding_idx=self.padding_index, + max_norm=self.max_norm, + norm_type=self.norm_type, + scale_grad_by_freq=self.scale_grad_by_freq, + sparse=self.sparse, + ) + + # Now (if necessary) add back in the extra dimensions. + embedded = uncombine_initial_dims(embedded, original_size) + + if self._projection: + projection = self._projection + for _ in range(embedded.dim() - 2): + projection = TimeDistributed(projection) # type: ignore + embedded = projection(embedded) + return embedded + + +class TimeDistributed(torch.nn.Module): + r"""Given an input shaped like `(batch_size, time_steps, [rest])` and a + `Module` that takes inputs like `(batch_size, [rest])`, `TimeDistributed` + reshapes the input to be `(batch_size * time_steps, [rest])`, applies the + contained `Module`, then reshapes it back. + + Note that while the above gives shapes with `batch_size` first, this + `Module` also works if `batch_size` is second - we always just combine the + first two dimensions, then split them. + + It also reshapes keyword arguments unless they are not tensors or their + name is specified in the optional `pass_through` iterable. 
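+
+    For illustration, a minimal sketch with made-up sizes:
+
+        distributed = TimeDistributed(torch.nn.Linear(4, 2))
+        outputs = distributed(torch.rand(3, 5, 4))  # shape: (3, 5, 2)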
+ """ + + def __init__(self, module): + super().__init__() + self._module = module + + def forward(self, *inputs, + pass_through: Optional[List[str]] = None, **kwargs): + + pass_through = pass_through or [] + + reshaped_inputs = [self._reshape_tensor(input_tensor) + for input_tensor in inputs] + + # Need some input to then get the batch_size and time_steps. + some_input = None + if inputs: + some_input = inputs[-1] + + reshaped_kwargs = {} + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor) and key not in pass_through: + if some_input is None: + some_input = value + + value = self._reshape_tensor(value) + + reshaped_kwargs[key] = value + + reshaped_outputs = self._module(*reshaped_inputs, **reshaped_kwargs) + + if some_input is None: + raise RuntimeError("No input tensor to time-distribute") + + # Now get the output back into the right shape. + # (batch_size, time_steps, **output_size) + new_size = some_input.size()[:2] + reshaped_outputs.size()[1:] + outputs = reshaped_outputs.contiguous().view(new_size) + + return outputs + + @staticmethod + def _reshape_tensor(input_tensor): + input_size = input_tensor.size() + if len(input_size) <= 2: + raise RuntimeError(f"No dimension to distribute: {input_size}") + # Squash batch_size and time_steps into a single axis; result has shape + # (batch_size * time_steps, **input_size). + squashed_shape = [-1] + list(input_size[2:]) + return input_tensor.contiguous().view(*squashed_shape) + + +def add_sentence_boundary_token_ids( + tensor: torch.Tensor, mask: torch.Tensor, + sentence_begin_token: Any, sentence_end_token: Any) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Add begin/end of sentence tokens to the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)` this returns a tensor of shape + `(batch_size, timesteps + 2)` or `(batch_size, timesteps + 2, dim)` + respectively. + + Returns both the new tensor and updated mask. + + # Parameters + + tensor : `torch.Tensor` + A tensor of shape `(batch_size, timesteps)` or + `(batch_size, timesteps, dim)` + mask : `torch.Tensor` + A tensor of shape `(batch_size, timesteps)` + sentence_begin_token: Any (anything that can be broadcast in torch for + assignment) + For 2D input, a scalar with the id. For 3D input, a tensor with + length dim. + sentence_end_token: Any (anything that can be broadcast in torch for + assignment) + For 2D input, a scalar with the id. For 3D input, a tensor with + length dim. + + # Returns + + tensor_with_boundary_tokens : `torch.Tensor` + The tensor with the appended and prepended boundary tokens. If the + input was 2D, it has shape (batch_size, timesteps + 2) and if the + input was 3D, it has shape (batch_size, timesteps + 2, dim). + new_mask : `torch.Tensor` + The new mask for the tensor, taking into account the appended tokens + marking the beginning and end of the sentence. 
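+
+    For illustration, in the 2D case with hypothetical begin/end ids 1 and 2:
+
+        tensor = torch.tensor([[7, 8, 0]])
+        mask = torch.tensor([[1, 1, 0]])
+        new_tensor, new_mask = add_sentence_boundary_token_ids(
+            tensor, mask, 1, 2)
+        # new_tensor: [[1, 7, 8, 2, 0]], new_mask: [[1, 1, 1, 1, 0]]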
+ """ + # TODO: matthewp, profile this transfer + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] + 2 + tensor_with_boundary_tokens = tensor.new_zeros(*new_shape) + if len(tensor_shape) == 2: + tensor_with_boundary_tokens[:, 1:-1] = tensor + tensor_with_boundary_tokens[:, 0] = sentence_begin_token + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, j + 1] = sentence_end_token + new_mask = (tensor_with_boundary_tokens != 0).long() + elif len(tensor_shape) == 3: + tensor_with_boundary_tokens[:, 1:-1, :] = tensor + for i, j in enumerate(sequence_lengths): + tensor_with_boundary_tokens[i, 0, :] = sentence_begin_token + tensor_with_boundary_tokens[i, j + 1, :] = sentence_end_token + new_mask = ( + (tensor_with_boundary_tokens > 0).long().sum(dim=-1) > 0).long() + else: + raise ValueError( + "add_sentence_boundary_token_ids only accepts 2D and 3D input") + + return tensor_with_boundary_tokens, new_mask + + +def get_device_of(tensor: torch.Tensor) -> int: + r"""Returns the device of the tensor. + """ + if not tensor.is_cuda: + return -1 + else: + return tensor.get_device() + + +def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor]: + r"""Remove begin/end of sentence embeddings from the batch of sentences. + Given a batch of sentences with size `(batch_size, timesteps, dim)` + this returns a tensor of shape `(batch_size, timesteps - 2, dim)` after + removing the beginning and end sentence markers. The sentences are + assumed to be padded on the right, with the beginning of each sentence + assumed to occur at index 0 (i.e., `mask[:, 0]` is assumed to be 1). + + Returns both the new tensor and updated mask. + + This function is the inverse of `add_sentence_boundary_token_ids`. + + # Parameters + + tensor : `torch.Tensor` + A tensor of shape `(batch_size, timesteps, dim)` + mask : `torch.Tensor` + A tensor of shape `(batch_size, timesteps)` + + # Returns + + tensor_without_boundary_tokens : `torch.Tensor` + The tensor after removing the boundary tokens of shape + `(batch_size, timesteps - 2, dim)` + new_mask : `torch.Tensor` + The new mask for the tensor of shape `(batch_size, timesteps - 2)`. + """ + # TODO: matthewp, profile this transfer + sequence_lengths = mask.sum(dim=1).detach().cpu().numpy() + tensor_shape = list(tensor.data.shape) + new_shape = list(tensor_shape) + new_shape[1] = tensor_shape[1] - 2 + tensor_without_boundary_tokens = tensor.new_zeros(*new_shape) + new_mask = tensor.new_zeros((new_shape[0], new_shape[1]), dtype=torch.long) + for i, j in enumerate(sequence_lengths): + if j > 2: + tensor_without_boundary_tokens[i, : (j - 2), :] = \ + tensor[i, 1: (j - 1), :] + new_mask[i, : (j - 2)] = 1 + + return tensor_without_boundary_tokens, new_mask + + +A = TypeVar("A") + + +def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: + r"""Takes an iterable and batches the individual instances into lists of the + specified size. The last list may be smaller if there are instances left + over. + """ + iterator = iter(iterable) + while True: + s = list(islice(iterator, group_size)) + if len(s) > 0: + yield s + else: + break + + +class ConfigurationError(Exception): + r"""The exception raised by any AllenNLP object when it's misconfigured + (e.g. missing properties, invalid properties, unknown properties). 
+ """ + + def __init__(self, message): + super().__init__() + self.message = message + + def __str__(self): + # TODO(brendanr): Is there some reason why we need repr here? It + # produces horrible output for simple multi-line error messages. + return self.message + + +def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): + r"""Compute sequence lengths for each batch element in a tensor using a + binary mask. + + # Parameters + + mask : torch.Tensor, required. + A 2D binary mask of shape (batch_size, sequence_length) to + calculate the per-batch sequence lengths from. + + # Returns + + A torch.LongTensor of shape (batch_size,) representing the lengths + of the sequences in the batch. + """ + return mask.long().sum(-1) + + +def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): + r"""Sort a batch first tensor by some specified lengths. + + # Parameters + + tensor : torch.FloatTensor, required. + A batch first Pytorch tensor. + sequence_lengths : torch.LongTensor, required. + A tensor representing the lengths of some dimension of the tensor which + we want to sort by. + + # Returns + + sorted_tensor : torch.FloatTensor + The original tensor sorted along the batch dimension with respect to + sequence_lengths. + sorted_sequence_lengths : torch.LongTensor + The original sequence_lengths sorted by decreasing size. + restoration_indices : torch.LongTensor + Indices into the sorted_tensor such that + `sorted_tensor.index_select(0, restoration_indices) == original_tensor` + permutation_index : torch.LongTensor + The indices used to sort the tensor. This is useful if you want to sort + many tensors using the same ordering. + """ + + if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, + torch.Tensor): + raise ConfigurationError( + "Both the tensor and sequence lengths must be torch.Tensors.") + + sorted_sequence_lengths, permutation_index = sequence_lengths.sort( + 0, descending=True) + sorted_tensor = tensor.index_select(0, permutation_index) + + index_range = torch.arange(0, len(sequence_lengths), + device=sequence_lengths.device) + # This is the equivalent of zipping with index, sorting by the original + # sequence lengths and returning the now sorted indices. + _, reverse_mapping = permutation_index.sort(0, descending=False) + restoration_indices = index_range.index_select(0, reverse_mapping) + return (sorted_tensor, sorted_sequence_lengths, restoration_indices, + permutation_index) + + +def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], + gain: float = 1.0) -> None: + r"""An initializer which allows initializing model parameters in "blocks". + This is helpful in the case of recurrent models which use multiple gates + applied to linear projections, which can be computed efficiently if they + are concatenated together. However, they are separate parameters which + should be initialized independently. + + # Parameters + + tensor : `torch.Tensor`, required. + A tensor to initialize. + split_sizes : List[int], required. + A list of length `tensor.ndim()` specifying the size of the + blocks along that particular dimension. E.g. `[10, 20]` would + result in the tensor being split into chunks of size 10 along the + first dimension and 20 along the second. + gain : float, optional (default = 1.0) + The gain (scaling) applied to the orthogonal initialization. 
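+
+    For illustration, a minimal sketch with made-up sizes:
+
+        weight = torch.empty(10, 6)
+        block_orthogonal(weight, [5, 3])
+        # each of the four (5, 3) blocks is now orthogonal, initialized
+        # independently of the others.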
+ """ + data = tensor.data + sizes = list(tensor.size()) + if any(a % b != 0 for a, b in zip(sizes, split_sizes)): + raise ConfigurationError( + "tensor dimensions must be divisible by their respective " + "split_sizes. Found size: {} and split_sizes: {}".format( + sizes, split_sizes) + ) + indexes = [list(range(0, max_size, split)) for max_size, split in zip( + sizes, split_sizes)] + # Iterate over all possible blocks within the tensor. + for block_start_indices in itertools.product(*indexes): + # A list of tuples containing the index to start at for this block + # and the appropriate step size (i.e split_size[i] for dimension i). + index_and_step_tuples = zip(block_start_indices, split_sizes) + # This is a tuple of slices corresponding to: + # tensor[index: index + step_size, ...]. This is required because we + # could have an arbitrary number of dimensions. The actual slices we + # need are the start_index: start_index + step for each dimension in + # the tensor. + block_slice = tuple( + slice(start_index, start_index + step) for start_index, step in + index_and_step_tuples + ) + data[block_slice] = torch.nn.init.orthogonal_( + tensor[block_slice].contiguous(), gain=gain) + + +def get_dropout_mask(dropout_probability: float, + tensor_for_masking: torch.Tensor): + r"""Computes and returns an element-wise dropout mask for a given tensor, + where each element in the mask is dropped out with probability + dropout_probability. Note that the mask is NOT applied to the tensor - + the tensor is passed to retain the correct CUDA tensor type for the mask. + + # Parameters + + dropout_probability : float, required. + Probability of dropping a dimension of the input. + tensor_for_masking : torch.Tensor, required. + + # Returns + + A torch.FloatTensor consisting of the binary mask scaled by + 1/ (1 - dropout_probability). + This scaling ensures expected values and variances of the output of + applying this mask and the original tensor are the same. + """ + binary_mask = ( + torch.rand(tensor_for_masking.size()) > dropout_probability).to( + tensor_for_masking.device + ) + # Scale mask by 1/keep_prob to preserve output statistics. + dropout_mask = binary_mask.float().div(1.0 - dropout_probability) + return dropout_mask + + +def combine_initial_dims(tensor: torch.Tensor) -> torch.Tensor: + r"""Given a (possibly higher order) tensor of ids with shape + (d1, ..., dn, sequence_length) Return a view that's + (d1 * ... * dn, sequence_length). If original tensor is 1-d or 2-d, + return it as is. + """ + if tensor.dim() <= 2: + return tensor + else: + return tensor.view(-1, tensor.size(-1)) + + +def uncombine_initial_dims(tensor: torch.Tensor, original_size: torch.Size) -> \ + torch.Tensor: + r"""Given a tensor of embeddings with shape + (d1 * ... * dn, sequence_length, embedding_dim) and the original shape + (d1, ..., dn, sequence_length), return the reshaped tensor of embeddings + with shape (d1, ..., dn, sequence_length, embedding_dim). + If original size is 1-d or 2-d, return it as is. + """ + if len(original_size) <= 2: + return tensor + else: + view_args = list(original_size) + [tensor.size(-1)] + return tensor.view(*view_args) + + +class ScalarMix(torch.nn.Module): + r"""Computes a parameterised scalar mixture of N tensors, + `mixture = gamma * sum(s_k * tensor_k)` where `s = softmax(w)`, with `w` + and `gamma` scalar parameters. + + In addition, if `do_layer_norm=True` then apply layer normalization to + each tensor before weighting. 
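+
+    For illustration, a minimal sketch with made-up sizes:
+
+        scalar_mix = ScalarMix(mixture_size=3)
+        tensors = [torch.rand(2, 5, 7) for _ in range(3)]
+        mixture = scalar_mix(tensors)  # shape: (2, 5, 7)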
+ """ + + def __init__(self, mixture_size: int, do_layer_norm: bool = False, + initial_scalar_parameters: Optional[List[float]] = None, + trainable: bool = True,) -> None: + super().__init__() + self.mixture_size = mixture_size + self.do_layer_norm = do_layer_norm + + if initial_scalar_parameters is None: + initial_scalar_parameters = [0.0] * mixture_size + elif len(initial_scalar_parameters) != mixture_size: + raise ConfigurationError( + "Length of initial_scalar_parameters {} differs " + "from mixture_size {}".format(initial_scalar_parameters, + mixture_size) + ) + + self.scalar_parameters = ParameterList( + [ + Parameter( + torch.FloatTensor([initial_scalar_parameters[i]]), + requires_grad=trainable + ) + for i in range(mixture_size) + ] + ) + self.gamma = Parameter(torch.FloatTensor([1.0]), + requires_grad=trainable) + + def forward(self, tensors: List[torch.Tensor], # type: ignore + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + r"""Compute a weighted average of the `tensors`. The input tensors an + be any shape with at least two dimensions, but must all be the same + shape. + + When `do_layer_norm=True`, the `mask` is required input. If the + `tensors` are dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the + `mask` is dimensioned `(dim_0, ..., dim_{n-1})`, as in the typical + case with `tensors` of shape `(batch_size, timesteps, dim)` and `mask` + of shape `(batch_size, timesteps)`. + + When `do_layer_norm=False` the `mask` is ignored. + """ + if len(tensors) != self.mixture_size: + raise ConfigurationError( + "{} tensors were passed, but the module was initialized to " + "mix {} tensors.".format(len(tensors), self.mixture_size) + ) + + def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): + tensor_masked = tensor * broadcast_mask + mean = torch.sum(tensor_masked) / num_elements_not_masked + variance = ( + torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) / + num_elements_not_masked + ) + return (tensor - mean) / torch.sqrt(variance + 1e-12) + + # pylint: disable=unnecessary-comprehension + normed_weights = torch.nn.functional.softmax( + torch.cat([parameter for parameter in self.scalar_parameters]), + dim=0 + ) + normed_weights = torch.split(normed_weights, split_size_or_sections=1) + + if not self.do_layer_norm: + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append(weight * tensor) + return self.gamma * sum(pieces) + + else: + assert mask is not None + mask_float = mask.float() + broadcast_mask = mask_float.unsqueeze(-1) + input_dim = tensors[0].size(-1) + num_elements_not_masked = torch.sum(mask_float) * input_dim + + pieces = [] + for weight, tensor in zip(normed_weights, tensors): + pieces.append( + weight * _do_layer_norm(tensor, broadcast_mask, + num_elements_not_masked) + ) + return self.gamma * sum(pieces) diff --git a/texar/torch/modules/pretrained/elmo_utils_test.py b/texar/torch/modules/pretrained/elmo_utils_test.py new file mode 100644 index 000000000..34d826241 --- /dev/null +++ b/texar/torch/modules/pretrained/elmo_utils_test.py @@ -0,0 +1,882 @@ +# Copyright 2019 The Texar Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for utils of ELMo modules. + +Code adapted from: + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/common/util_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/elmo_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/encoder_base_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/lstm_cell_with_projection_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/highway_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/modules/time_distributed_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/initializers_test.py` + `https://github.com/allenai/allennlp/blob/master/allennlp/tests/nn/util_test.py` +""" + +import unittest + +import h5py +import json +import numpy +import tempfile +import torch + +from numpy.testing import assert_array_almost_equal, assert_almost_equal +from torch.nn import LSTM, RNN, Embedding, Module, Parameter + +from texar.torch.data.tokenizers.elmo_tokenizer_utils import batch_to_ids +from texar.torch.data.data_utils import maybe_download +from texar.torch.modules.pretrained.elmo_utils import ( + Highway, LstmCellWithProjection, _EncoderBase, _ElmoBiLm, TimeDistributed, + sort_batch_by_length, get_lengths_from_binary_sequence_mask, + remove_sentence_boundaries, add_sentence_boundary_token_ids, + lazy_groups_of, block_orthogonal, ConfigurationError, combine_initial_dims, + uncombine_initial_dims, ScalarMix) +from texar.torch.utils.test import cuda_test + + +class TestElmoBiLm(unittest.TestCase): + + def setUp(self): + super().setUp() + self.tmp_dir = tempfile.TemporaryDirectory() + self.options_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/options.json?raw=true', + self.tmp_dir.name) + self.weight_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/lm_weights.hdf5?raw=true', + self.tmp_dir.name) + self.sentences_json_file = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/tests/' + 'fixtures/elmo/sentences.json?raw=true', + self.tmp_dir.name) + + def tearDown(self): + self.tmp_dir.cleanup() + + def _load_sentences_embeddings(self): + r"""Load the test sentences and the expected LM embeddings. + + These files loaded in this method were created with a batch-size of 3. + Due to idiosyncrasies with TensorFlow, the 30 sentences in + sentences.json are split into 3 files in which the k-th sentence in + each is from batch k. + + This method returns a (sentences, embeddings) pair where each is a + list of length batch_size. Each list contains a sublist with + total_sentence_count / batch_size elements. As with the original files, + the k-th element in the sublist is in batch k. 
+ """ + with open(self.sentences_json_file) as fin: + sentences = json.load(fin) + + # the expected embeddings + expected_lm_embeddings = [] + for k in range(len(sentences)): + embed_fname = maybe_download( + 'https://github.com/allenai/allennlp/blob/master/allennlp/' + 'tests/fixtures/elmo/lm_embeddings_{}.hdf5?raw=true'.format(k), + self.tmp_dir.name) + expected_lm_embeddings.append([]) + with h5py.File(embed_fname, "r") as fin: + for i in range(10): + sent_embeds = fin["%s" % i][...] + sent_embeds_concat = numpy.concatenate( + (sent_embeds[0, :, :], sent_embeds[1, :, :]), axis=-1 + ) + expected_lm_embeddings[-1].append(sent_embeds_concat) + + return sentences, expected_lm_embeddings + + def test_elmo_bilm(self): + # get the raw data + sentences, expected_lm_embeddings = self._load_sentences_embeddings() + + # load the test model + elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file) + + batches = [[sentences[j][i].split() for j in range(3)] + for i in range(10)] + + # Now finally we can iterate through batches. + for i, batch in enumerate(batches): + lm_embeddings = elmo_bilm(batch_to_ids(batch[:3])) + top_layer_embeddings, mask = remove_sentence_boundaries( + lm_embeddings["activations"][2], lm_embeddings["mask"] + ) + + # check the mask lengths + lengths = mask.data.numpy().sum(axis=1) + batch_sentences = [sentences[k][i] for k in range(3)] + expected_lengths = [len(sentence.split()) for sentence in + batch_sentences] + self.assertEqual(lengths.tolist(), expected_lengths) + + # get the expected embeddings and compare! + expected_top_layer = [expected_lm_embeddings[k][i] for k in + range(3)] + for k in range(3): + self.assertTrue( + numpy.allclose( + top_layer_embeddings[k, : lengths[k], :].data.numpy(), + expected_top_layer[k], + atol=1.0e-6, + ) + ) + + +class TestEncoderBase(unittest.TestCase): + + def setUp(self): + super().setUp() + self.lstm = LSTM( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True + ) + self.rnn = RNN( + bidirectional=True, num_layers=3, input_size=3, hidden_size=7, + batch_first=True + ) + self.encoder_base = _EncoderBase(stateful=True) + + tensor = torch.rand([5, 7, 3]) + tensor[1, 6:, :] = 0 + tensor[3, 2:, :] = 0 + self.tensor = tensor + mask = torch.ones(5, 7) + mask[1, 6:] = 0 + mask[2, :] = 0 # <= completely masked + mask[3, 2:] = 0 + mask[4, :] = 0 # <= completely masked + self.mask = mask + + self.batch_size = 5 + self.num_valid = 3 + sequence_lengths = get_lengths_from_binary_sequence_mask(mask) + _, _, restoration_indices, sorting_indices = sort_batch_by_length( + tensor, sequence_lengths) + self.sorting_indices = sorting_indices + self.restoration_indices = restoration_indices + + def test_non_stateful_states_are_sorted_correctly(self): + encoder_base = _EncoderBase(stateful=False) + initial_states = (torch.randn(6, 5, 7), torch.randn(6, 5, 7)) + # Check that we sort the state for non-stateful encoders. To test + # we'll just use a "pass through" encoder, as we aren't actually testing + # the functionality of the encoder here anyway. + _, states, restoration_indices = encoder_base.sort_and_run_forward( + lambda *x: x, self.tensor, self.mask, initial_states + ) + # Our input tensor had 2 zero length sequences, so we need + # to concat a tensor of shape + # (num_layers * num_directions, batch_size - num_valid, hidden_dim), + # to the output before unsorting it. 
+ zeros = torch.zeros([6, 2, 7]) + + # sort_and_run_forward strips fully-padded instances from the batch; + # in order to use the restoration_indices we need to add back the two + # that got stripped. What we get back should match what we started with. + for state, original in zip(states, initial_states): + assert list(state.size()) == [6, 3, 7] + state_with_zeros = torch.cat([state, zeros], 1) + unsorted_state = state_with_zeros.index_select(1, + restoration_indices) + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + unsorted_state[:, index, :].data.numpy(), + original[:, index, :].data.numpy() + ) + + def test_get_initial_states(self): + # First time we call it, there should be no state, so we should return + # None. + assert ( + self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices + ) + is None + ) + + # First test the case that the previous state is _smaller_ than the + # current state input. + initial_states = (torch.randn([1, 3, 7]), torch.randn([1, 3, 7])) + self.encoder_base._states = initial_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices + ) + + correct_expanded_states = [ + torch.cat([state, torch.zeros([1, 2, 7])], 1) + for state in initial_states + ] + # State should have been expanded with zeros to have shape + # (1, batch_size, hidden_size). + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + correct_expanded_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + correct_expanded_states[1].data.numpy() + ) + + # The returned states should be of shape (1, num_valid, hidden_size) and + # they also should have been sorted with respect to the indices. + # sorting indices are: [0, 1, 3, 2, 4] + + correct_returned_states = [ + state.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for state in correct_expanded_states + ] + + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_states[1].data.numpy() + ) + + # Now test the case that the previous state is larger: + original_states = (torch.randn([1, 10, 7]), torch.randn([1, 10, 7])) + self.encoder_base._states = original_states + # sorting indices are: [0, 1, 3, 2, 4] + returned_states = self.encoder_base._get_initial_states( + self.batch_size, self.num_valid, self.sorting_indices + ) + # State should not have changed, as they were larger + # than the batch size of the requested states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + original_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + original_states[1].data.numpy() + ) + + # The returned states should be of shape (1, num_valid, hidden_size) + # and they also should have been sorted with respect to the indices. 
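+        # With sorting indices [0, 1, 3, 2, 4] and num_valid = 3, the slice
+        # below keeps the state rows for batch elements 0, 1 and 3.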
+ correct_returned_state = [ + x.index_select(1, self.sorting_indices)[:, : self.num_valid, :] + for x in original_states + ] + numpy.testing.assert_array_equal( + returned_states[0].data.numpy(), + correct_returned_state[0].data.numpy() + ) + numpy.testing.assert_array_equal( + returned_states[1].data.numpy(), + correct_returned_state[1].data.numpy() + ) + + def test_update_states(self): + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices), + ) + + self.encoder_base._update_states(initial_states, + self.restoration_indices) + # State was None, so the updated state should just be the sorted given + # state. + numpy.testing.assert_array_equal( + self.encoder_base._states[0].data.numpy(), + index_selected_initial_states[0].data.numpy() + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1].data.numpy(), + index_selected_initial_states[1].data.numpy() + ) + + new_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + # tensor has 2 completely masked rows, so the last 2 rows of the _ + # sorted_ states will be completely zero, having been appended after + # calling the respective encoder. + new_states[0][:, -2:, :] = 0 + new_states[1][:, -2:, :] = 0 + + index_selected_new_states = ( + new_states[0].index_select(1, self.restoration_indices), + new_states[1].index_select(1, self.restoration_indices), + ) + + self.encoder_base._update_states(new_states, self.restoration_indices) + # Check that the update _preserved_ the state for the rows which were + # completely masked (2 and 4): + for index in [2, 4]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_initial_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_initial_states[1][:, index, :].data.numpy(), + ) + # Now the states which were updated: + for index in [0, 1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(), + ) + + # Now test the case that the new state is smaller: + small_new_states = torch.randn([1, 3, 7]), torch.randn([1, 3, 7]) + # pretend the 2nd sequence in the batch was fully masked. 
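+        # Zeroing the first row of the sorted states below mimics the all-zero
+        # rows appended for fully padded sequences, so _update_states should
+        # keep the previous state for that row.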
+ small_restoration_indices = torch.LongTensor([2, 0, 1]) + small_new_states[0][:, 0, :] = 0 + small_new_states[1][:, 0, :] = 0 + + index_selected_small_states = ( + small_new_states[0].index_select(1, small_restoration_indices), + small_new_states[1].index_select(1, small_restoration_indices), + ) + self.encoder_base._update_states(small_new_states, + small_restoration_indices) + + # Check the index for the row we didn't update is the same as the + # previous step: + for index in [1, 3]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_new_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_new_states[1][:, index, :].data.numpy(), + ) + # Indices we did update: + for index in [0, 2]: + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, index, :].data.numpy(), + index_selected_small_states[0][:, index, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, index, :].data.numpy(), + index_selected_small_states[1][:, index, :].data.numpy(), + ) + + # We didn't update index 4 in the previous step either, so it should + # be equal to the 4th index of initial states. + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 4, :].data.numpy(), + index_selected_initial_states[0][:, 4, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 4, :].data.numpy(), + index_selected_initial_states[1][:, 4, :].data.numpy(), + ) + + def test_reset_states(self): + # Initialize the encoder states. + assert self.encoder_base._states is None + initial_states = torch.randn([1, 5, 7]), torch.randn([1, 5, 7]) + index_selected_initial_states = ( + initial_states[0].index_select(1, self.restoration_indices), + initial_states[1].index_select(1, self.restoration_indices), + ) + self.encoder_base._update_states(initial_states, + self.restoration_indices) + + # Check that only some of the states are reset when a mask is provided. + mask = torch.FloatTensor([1, 1, 0, 0, 0]) + self.encoder_base.reset_states(mask) + # First two states should be zeros + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[0])[:, :2, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, :2, :].data.numpy(), + torch.zeros_like(initial_states[1])[:, :2, :].data.numpy(), + ) + # Remaining states should be the same + numpy.testing.assert_array_equal( + self.encoder_base._states[0][:, 2:, :].data.numpy(), + index_selected_initial_states[0][:, 2:, :].data.numpy(), + ) + numpy.testing.assert_array_equal( + self.encoder_base._states[1][:, 2:, :].data.numpy(), + index_selected_initial_states[1][:, 2:, :].data.numpy(), + ) + + # Check that error is raised if mask has wrong batch size. + bad_mask = torch.FloatTensor([1, 1, 0]) + with self.assertRaises(ValueError): + self.encoder_base.reset_states(bad_mask) + + # Check that states are reset to None if no mask is provided. + self.encoder_base.reset_states() + assert self.encoder_base._states is None + + def test_non_contiguous_initial_states_handled(self): + # Check that the encoder is robust to non-contiguous initial states. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. 
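+        # (permute only rearranges strides, so the permuted tensors are views
+        # that are not laid out contiguously in memory.)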
+ encoder_base = _EncoderBase(stateful=False) + initial_states = ( + torch.randn(5, 6, 7).permute(1, 0, 2), + torch.randn(5, 6, 7).permute(1, 0, 2), + ) + assert not initial_states[0].is_contiguous() and \ + not initial_states[1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward(self.lstm, self.tensor, + self.mask, initial_states) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, + self.mask, initial_states[0]) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if + # its state was previously updated with non-contiguous tensors. As in + # the non-stateful tests, we check that the encoder still works on + # initial states for RNNs and LSTMs. + final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True) + encoder_base._update_states(final_states, self.restoration_indices) + encoder_base.sort_and_run_forward(self.lstm, self.tensor, self.mask) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], self.restoration_indices) + encoder_base.sort_and_run_forward(self.rnn, self.tensor, self.mask) + + @cuda_test + def test_non_contiguous_initial_states_handled_on_gpu(self): + # Some PyTorch operations which produce contiguous tensors on the CPU + # produce non-contiguous tensors on the GPU (e.g. forward pass of an + # RNN when batch_first=True). Accordingly, we perform the same checks + # from previous test on the GPU to ensure the encoder is not affected + # by which device it is on. + + # Case 1: Encoder is not stateful + + # A transposition will make the tensors non-contiguous, start them off + # at the wrong shape and transpose them into the right shape. + encoder_base = _EncoderBase(stateful=False).cuda() + initial_states = ( + torch.randn(5, 6, 7).cuda().permute(1, 0, 2), + torch.randn(5, 6, 7).cuda().permute(1, 0, 2), + ) + assert not initial_states[0].is_contiguous() and not initial_states[ + 1].is_contiguous() + assert initial_states[0].size() == torch.Size([6, 5, 7]) + assert initial_states[1].size() == torch.Size([6, 5, 7]) + + # We'll pass them through an LSTM encoder and a vanilla RNN encoder to + # make sure it works whether the initial states are a tuple of tensors + # or just a single tensor. + encoder_base.sort_and_run_forward( + self.lstm.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states + ) + encoder_base.sort_and_run_forward( + self.rnn.cuda(), self.tensor.cuda(), self.mask.cuda(), + initial_states[0] + ) + + # Case 2: Encoder is stateful + + # For stateful encoders, the initial state may be non-contiguous if its + # state was previously updated with non-contiguous tensors. As in the + # non-stateful tests, we check that the encoder still works on initial + # states for RNNs and LSTMs. 
+ final_states = initial_states + # Check LSTM + encoder_base = _EncoderBase(stateful=True).cuda() + encoder_base._update_states(final_states, + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.lstm.cuda(), self.tensor.cuda(), + self.mask.cuda()) + # Check RNN + encoder_base.reset_states() + encoder_base._update_states([final_states[0]], + self.restoration_indices.cuda()) + encoder_base.sort_and_run_forward(self.rnn.cuda(), self.tensor.cuda(), + self.mask.cuda()) + + +class TestHighway(unittest.TestCase): + + def test_forward_works_on_simple_input(self): + highway = Highway(2, 2) + + highway._layers[0].weight.data.fill_(1) + highway._layers[0].bias.data.fill_(0) + highway._layers[1].weight.data.fill_(2) + highway._layers[1].bias.data.fill_(-2) + input_tensor = torch.FloatTensor([[-2, 1], [3, -2]]) + result = highway(input_tensor).data.numpy() + assert result.shape == (2, 2) + # This was checked by hand. + assert_almost_equal(result, [[-0.0394, 0.0197], [1.7527, -0.5550]], + decimal=4) + + def test_forward_works_on_nd_input(self): + highway = Highway(2, 2) + input_tensor = torch.ones(2, 2, 2) + output = highway(input_tensor) + assert output.size() == (2, 2, 2) + + +class TestLstmCellWithProjection(unittest.TestCase): + + def test_elmo_lstm_cell_completes_forward_pass(self): + input_tensor = torch.rand(4, 5, 3) + input_tensor[1, 4:, :] = 0.0 + input_tensor[2, 2:, :] = 0.0 + input_tensor[3, 1:, :] = 0.0 + + initial_hidden_state = torch.ones([1, 4, 5]) + initial_memory_state = torch.ones([1, 4, 7]) + + lstm = LstmCellWithProjection( + input_size=3, + hidden_size=5, + cell_size=7, + memory_cell_clip_value=2, + state_projection_clip_value=1, + ) + output_sequence, lstm_state = lstm( + input_tensor, [5, 4, 2, 1], (initial_hidden_state, + initial_memory_state) + ) + numpy.testing.assert_array_equal( + output_sequence.data[1, 4:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[2, 2:, :].numpy(), 0.0) + numpy.testing.assert_array_equal( + output_sequence.data[3, 1:, :].numpy(), 0.0) + + # Test the state clipping. + numpy.testing.assert_array_less(output_sequence.data.numpy(), 1.0) + numpy.testing.assert_array_less(-output_sequence.data.numpy(), 1.0) + + # LSTM state should be (num_layers, batch_size, hidden_size) + assert list(lstm_state[0].size()) == [1, 4, 5] + # LSTM memory cell should be (num_layers, batch_size, cell_size) + assert list((lstm_state[1].size())) == [1, 4, 7] + + # Test the cell clipping. 
+ numpy.testing.assert_array_less(lstm_state[0].data.numpy(), 2.0) + numpy.testing.assert_array_less(-lstm_state[0].data.numpy(), 2.0) + + +class TestTimeDistributed(unittest.TestCase): + + def test_time_distributed_reshapes_named_arg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter( + torch.FloatTensor([[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]] + ) + + def test_time_distributed_reshapes_positional_kwarg_correctly(self): + char_embedding = Embedding(2, 2) + char_embedding.weight = Parameter(torch.FloatTensor( + [[0.4, 0.4], [0.5, 0.5]])) + distributed_embedding = TimeDistributed(char_embedding) + char_input = torch.LongTensor([[[1, 0], [1, 1]]]) + output = distributed_embedding(input=char_input) + assert_almost_equal( + output.data.numpy(), + [[[[0.5, 0.5], [0.4, 0.4]], [[0.5, 0.5], [0.5, 0.5]]]] + ) + + def test_time_distributed_works_with_multiple_inputs(self): + module = lambda x, y: x + y + distributed = TimeDistributed(module) + x_input = torch.LongTensor([[[1, 2], [3, 4]]]) + y_input = torch.LongTensor([[[4, 2], [9, 1]]]) + output = distributed(x_input, y_input) + assert_almost_equal(output.data.numpy(), [[[5, 4], [12, 5]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_tensor_correctly(self): + + class FakeModule(Module): + + def forward(self, input_tensor, tensor_to_pass_through=None, + another_tensor=None): + + return input_tensor + tensor_to_pass_through + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_to_pass_through = torch.LongTensor([3, 7]) + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + tensor_to_pass_through=input_to_pass_through, + another_tensor=input_tensor2, + pass_through=["tensor_to_pass_through"], + ) + assert_almost_equal(output.data.numpy(), [[[8, 11], [15, 12]]]) + + def test_time_distributed_reshapes_multiple_inputs_with_pass_through_non_tensor_correctly(self): + + class FakeModule(Module): + + def forward(self, input_tensor, number=0, another_tensor=None): + + return input_tensor + number + another_tensor + + module = FakeModule() + distributed_module = TimeDistributed(module) + + input_tensor1 = torch.LongTensor([[[1, 2], [3, 4]]]) + input_number = 5 + input_tensor2 = torch.LongTensor([[[4, 2], [9, 1]]]) + + output = distributed_module( + input_tensor1, + number=input_number, + another_tensor=input_tensor2, + pass_through=["number"], + ) + assert_almost_equal(output.data.numpy(), [[[10, 9], [17, 10]]]) + + +class TestUtils(unittest.TestCase): + + def test_add_sentence_boundary_token_ids_handles_2D_input(self): + tensor = torch.from_numpy(numpy.array([[1, 2, 3], [4, 5, 0]])) + mask = (tensor > 0).long() + bos = 9 + eos = 10 + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array([[9, 1, 2, 3, 10], [9, 4, 5, 10, 0]]) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == (expected_new_tensor > 0)).all() + + def test_add_sentence_boundary_token_ids_handles_3D_input(self): + tensor = torch.from_numpy( + numpy.array( + [ + [[1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2]], + [[4, 3, 2, 1], [8, 7, 6, 5], 
[0, 0, 0, 0]], + ] + ) + ) + mask = ((tensor > 0).sum(dim=-1) > 0).type(torch.LongTensor) + bos = torch.from_numpy(numpy.array([9, 9, 9, 9])) + eos = torch.from_numpy(numpy.array([10, 10, 10, 10])) + new_tensor, new_mask = add_sentence_boundary_token_ids( + tensor, mask, bos, eos) + expected_new_tensor = numpy.array( + [ + [[9, 9, 9, 9], [1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2], + [10, 10, 10, 10]], + [[9, 9, 9, 9], [4, 3, 2, 1], [8, 7, 6, 5], [10, 10, 10, 10], + [0, 0, 0, 0]], + ] + ) + assert (new_tensor.data.numpy() == expected_new_tensor).all() + assert (new_mask.data.numpy() == ( + (expected_new_tensor > 0).sum(axis=-1) > 0)).all() + + def test_remove_sentence_boundaries(self): + tensor = torch.from_numpy(numpy.random.rand(3, 5, 7)) + mask = torch.from_numpy( + # The mask with two elements is to test the corner case + # of an empty sequence, so here we are removing boundaries + # from " " + numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]) + ).long() + new_tensor, new_mask = remove_sentence_boundaries(tensor, mask) + + expected_new_tensor = torch.zeros(3, 3, 7) + expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :] + expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :] + assert_array_almost_equal(new_tensor.data.numpy(), + expected_new_tensor.data.numpy()) + + expected_new_mask = torch.from_numpy(numpy.array( + [[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long() + assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all() + + def test_lazy_groups_of(self): + xs = [1, 2, 3, 4, 5, 6, 7] + groups = lazy_groups_of(iter(xs), group_size=3) + assert next(groups) == [1, 2, 3] + assert next(groups) == [4, 5, 6] + assert next(groups) == [7] + with self.assertRaises(StopIteration): + _ = next(groups) + + def test_get_sequence_lengths_from_binary_mask(self): + binary_mask = torch.ByteTensor( + [[1, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1], + [1, 0, 0, 0, 0, 0]] + ) + lengths = get_lengths_from_binary_sequence_mask(binary_mask) + numpy.testing.assert_array_equal(lengths.numpy(), + numpy.array([3, 2, 6, 1])) + + def test_sort_tensor_by_length(self): + tensor = torch.rand([5, 7, 9]) + tensor[0, 3:, :] = 0 + tensor[1, 4:, :] = 0 + tensor[2, 1:, :] = 0 + tensor[3, 5:, :] = 0 + + sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) + sorted_tensor, sorted_lengths, reverse_indices, _ = \ + sort_batch_by_length(tensor, sequence_lengths) + + # Test sorted indices are padded correctly. + numpy.testing.assert_array_equal( + sorted_tensor[1, 5:, :].data.numpy(), 0.0) + numpy.testing.assert_array_equal( + sorted_tensor[2, 4:, :].data.numpy(), 0.0) + numpy.testing.assert_array_equal( + sorted_tensor[3, 3:, :].data.numpy(), 0.0) + numpy.testing.assert_array_equal( + sorted_tensor[4, 1:, :].data.numpy(), 0.0) + + assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) + + # Test restoration indices correctly recover the original tensor. 
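+        # reverse_indices maps each original row to its position in the sorted
+        # batch, so index_select along dim 0 undoes the sort exactly.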
+ assert sorted_tensor.index_select(0, reverse_indices).data.equal( + tensor.data) + + def test_block_orthogonal_can_initialize(self): + tensor = torch.zeros([10, 6]) + block_orthogonal(tensor, [5, 3]) + tensor = tensor.data.numpy() + + def test_block_is_orthogonal(block) -> None: + matrix_product = block.T @ block + numpy.testing.assert_array_almost_equal( + matrix_product, numpy.eye(matrix_product.shape[-1]), 6 + ) + + test_block_is_orthogonal(tensor[:5, :3]) + test_block_is_orthogonal(tensor[:5, 3:]) + test_block_is_orthogonal(tensor[5:, 3:]) + test_block_is_orthogonal(tensor[5:, :3]) + + def test_block_orthogonal_raises_on_mismatching_dimensions(self): + tensor = torch.zeros([10, 6, 8]) + with self.assertRaises(ConfigurationError): + block_orthogonal(tensor, [7, 2, 1]) + + def test_combine_initial_dims(self): + tensor = torch.randn(4, 10, 20, 17, 5) + + tensor2d = combine_initial_dims(tensor) + assert list(tensor2d.size()) == [4 * 10 * 20 * 17, 5] + + def test_uncombine_initial_dims(self): + embedding2d = torch.randn(4 * 10 * 20 * 17 * 5, 12) + + embedding = uncombine_initial_dims(embedding2d, + torch.Size((4, 10, 20, 17, 5))) + assert list(embedding.size()) == [4, 10, 20, 17, 5, 12] + + +class TestScalarMix(unittest.TestCase): + + def test_scalar_mix_can_run_forward(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + for k in range(3): + mixture.scalar_parameters[k].data[0] = 0.1 * (k + 1) + mixture.gamma.data[0] = 0.5 + result = mixture(tensors) + + weights = [0.1, 0.2, 0.3] + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = sum(normed_weights[k] * tensors[k].data.numpy() + for k in range(3)) + expected_result *= 0.5 + numpy.testing.assert_almost_equal(expected_result, result.data.numpy()) + + def test_scalar_mix_throws_error_on_incorrect_number_of_inputs(self): + mixture = ScalarMix(3) + tensors = [torch.randn([3, 4, 5]) for _ in range(5)] + with self.assertRaises(ConfigurationError): + _ = mixture(tensors) + + def test_scalar_mix_throws_error_on_incorrect_initial_scalar_parameters_length(self): + with self.assertRaises(ConfigurationError): + ScalarMix(3, initial_scalar_parameters=[0.0, 0.0]) + + def test_scalar_mix_trainable_with_initial_scalar_parameters(self): + initial_scalar_parameters = [1.0, 2.0, 3.0] + mixture = ScalarMix(3, + initial_scalar_parameters=initial_scalar_parameters, + trainable=False) + for i, scalar_mix_parameter in enumerate(mixture.scalar_parameters): + assert scalar_mix_parameter.requires_grad is False + assert scalar_mix_parameter.item() == initial_scalar_parameters[i] + + def test_scalar_mix_layer_norm(self): + mixture = ScalarMix(3, do_layer_norm="scalar_norm_reg") + + tensors = [torch.randn([3, 4, 5]) for _ in range(3)] + numpy_mask = numpy.ones((3, 4), dtype="int32") + numpy_mask[1, 2:] = 0 + mask = torch.from_numpy(numpy_mask) + + weights = [0.1, 0.2, 0.3] + for k in range(3): + mixture.scalar_parameters[k].data[0] = weights[k] + mixture.gamma.data[0] = 0.5 + result = mixture(tensors, mask) + + normed_weights = numpy.exp(weights) / numpy.sum(numpy.exp(weights)) + expected_result = numpy.zeros((3, 4, 5)) + for k in range(3): + mean = numpy.mean(tensors[k].data.numpy()[numpy_mask == 1]) + std = numpy.std(tensors[k].data.numpy()[numpy_mask == 1]) + normed_tensor = (tensors[k].data.numpy() - mean) / (std + 1e-12) + expected_result += normed_tensor * normed_weights[k] + expected_result *= 0.5 + + numpy.testing.assert_almost_equal(expected_result, result.data.numpy(), + decimal=6) + 
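+
+# A minimal usage sketch (illustrative only, not exercised by the tests above):
+# ScalarMix is typically used to collapse several biLM activation layers into a
+# single representation; the layer count and tensor shapes here are arbitrary.
+def _scalar_mix_usage_sketch():
+    mixture = ScalarMix(3)
+    # Stand-ins for three activation layers of shape
+    # (batch_size, sequence_length, hidden_dim).
+    layers = [torch.randn(2, 6, 8) for _ in range(3)]
+    mixed = mixture(layers)
+    assert list(mixed.size()) == [2, 6, 8]
+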
+ +if __name__ == "__main__": + unittest.main() diff --git a/texar/torch/utils/test.py b/texar/torch/utils/test.py index 26bdfe10e..50a28eb27 100644 --- a/texar/torch/utils/test.py +++ b/texar/torch/utils/test.py @@ -21,6 +21,7 @@ __all__ = [ "pretrained_test", "data_test", + "cuda_test", "external_library_test", ] @@ -35,6 +36,8 @@ def define_skip_condition(flag: str, explanation: str): 'TEST_PRETRAINED', "Test requires loading pre-trained checkpoints.") data_test = define_skip_condition( 'TEST_DATA', "Test requires loading large data files.") +cuda_test = define_skip_condition( + 'TEST_CUDA', "Test requires cuda.") def external_library_test(name: str): From d97ed254ff3eb4d0d4c123685e5b58e3275a0705 Mon Sep 17 00:00:00 2001 From: Pengzhi Gao Date: Fri, 21 Feb 2020 15:23:21 -0500 Subject: [PATCH 2/3] Polish elmo tokenizer utils --- .../data/tokenizers/elmo_tokenizer_utils.py | 39 +++++------ .../tokenizers/elmo_tokenizer_utils_test.py | 67 +++++-------------- 2 files changed, 34 insertions(+), 72 deletions(-) diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py index ea454d0d8..9f51168a0 100644 --- a/texar/torch/data/tokenizers/elmo_tokenizer_utils.py +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils.py @@ -30,13 +30,11 @@ ] -def _make_bos_eos( - character: int, - padding_character: int, - beginning_of_word_character: int, - end_of_word_character: int, - max_word_length: int, -): +def _make_bos_eos(character: int, + padding_character: int, + beginning_of_word_character: int, + end_of_word_character: int, + max_word_length: int): char_ids = [padding_character] * max_word_length char_ids[0] = beginning_of_word_character char_ids[1] = character @@ -86,25 +84,22 @@ def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None: def convert_word_to_char_ids(self, word: str) -> List[int]: if word in self.tokens_to_add: - char_ids = ([ELMoCharacterMapper.padding_character] * - ELMoCharacterMapper.max_word_length) - char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character char_ids[1] = self.tokens_to_add[word] - char_ids[2] = ELMoCharacterMapper.end_of_word_character - elif word == ELMoCharacterMapper.bos_token: - char_ids = ELMoCharacterMapper.beginning_of_sentence_characters - elif word == ELMoCharacterMapper.eos_token: - char_ids = ELMoCharacterMapper.end_of_sentence_characters + char_ids[2] = self.end_of_word_character + elif word == self.bos_token: + char_ids = self.beginning_of_sentence_characters + elif word == self.eos_token: + char_ids = self.end_of_sentence_characters else: - word_encoded = word.encode( - "utf-8", "ignore")[: (ELMoCharacterMapper.max_word_length - 2)] - char_ids = ([ELMoCharacterMapper.padding_character] * - ELMoCharacterMapper.max_word_length) - char_ids[0] = ELMoCharacterMapper.beginning_of_word_character + word_encoded = word.encode("utf-8", "ignore")[: ( + self.max_word_length - 2)] + char_ids = [self.padding_character] * self.max_word_length + char_ids[0] = self.beginning_of_word_character for k, chr_id in enumerate(word_encoded, start=1): char_ids[k] = chr_id - char_ids[len(word_encoded) + 1] = \ - ELMoCharacterMapper.end_of_word_character + char_ids[len(word_encoded) + 1] = self.end_of_word_character # +1 one for masking return [c + 1 for c in char_ids] diff --git a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py 
b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py index f8dac6703..32e2c7a24 100644 --- a/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py +++ b/texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Unit tests for pre-trained ELMo tokenizer. +Unit tests for the utils of pre-trained ELMo tokenizer. Code adapted from: `https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py` @@ -29,75 +29,42 @@ class ELMoTokenizerUtilsTest(unittest.TestCase): def test_bos_to_char_ids(self): mapper = ELMoCharacterMapper() indices = mapper.convert_word_to_char_ids('') - expected_indices = [ - 259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + # [, , , , ... ] + expected_indices = [259, 257, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_eos_to_char_ids(self): mapper = ELMoCharacterMapper() indices = mapper.convert_word_to_char_ids('') - expected_indices = [ - 259, 258, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + expected_indices = [259, 258, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_unicode_to_char_ids(self): mapper = ELMoCharacterMapper() indices = mapper.convert_word_to_char_ids(chr(256) + "t") - expected_indices = [ - 259, 197, 129, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + expected_indices = [259, 197, 129, 117, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_additional_tokens(self): mapper = ELMoCharacterMapper(tokens_to_add={"": 1}) indices = mapper.convert_word_to_char_ids("") - expected_indices = [ - 259, 2, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - ] + expected_indices = [259, 2, 260] + expected_indices.extend([261] * (50 - len(expected_indices))) self.assertEqual(indices, expected_indices) def test_batch_to_ids(self): sentences = [['First', 'sentence', '.'], ['Another', '.']] indices = batch_to_ids(sentences) - expected_indices = [[[ - 259, 71, 106, 115, 116, 117, 260, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ - 259, 116, 102, 111, 117, 102, 111, 100, 102, 260, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], [ - 259, 47, 260, 261, 261, 
261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261]], - [[259, 66, 111, 112, 117, 105, 102, 115, 260, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], - [259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, - 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0]]] + expected_indices = [[ + [259, 71, 106, 115, 116, 117, 260] + [261] * 43, + [259, 116, 102, 111, 117, 102, 111, 100, 102, 260] + [261] * 40, + [259, 47, 260] + [261] * 47], [ + [259, 66, 111, 112, 117, 105, 102, 115, 260] + [261] * 41, + [259, 47, 260] + [261] * 47, + [0] * 50]] self.assertEqual(indices.tolist(), expected_indices) From aa641264f1653b8a7ad323321eda8a085b798c37 Mon Sep 17 00:00:00 2001 From: Pengzhi Gao Date: Fri, 21 Feb 2020 17:36:44 -0500 Subject: [PATCH 3/3] Move out some functions --- texar/torch/modules/pretrained/elmo_utils.py | 91 +------------------ .../modules/pretrained/elmo_utils_test.py | 55 ++--------- texar/torch/utils/utils.py | 72 ++++++++++++++- texar/torch/utils/utils_test.py | 50 ++++++---- 4 files changed, 112 insertions(+), 156 deletions(-) diff --git a/texar/torch/modules/pretrained/elmo_utils.py b/texar/torch/modules/pretrained/elmo_utils.py index 65b8f2f69..f2f8b729a 100644 --- a/texar/torch/modules/pretrained/elmo_utils.py +++ b/texar/torch/modules/pretrained/elmo_utils.py @@ -32,9 +32,7 @@ import json import logging -from itertools import islice -from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional, - Tuple, TypeVar, Union) +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import h5py import numpy @@ -47,6 +45,8 @@ from texar.torch.data.tokenizers.elmo_tokenizer_utils import ( batch_to_ids, ELMoCharacterMapper) +from texar.torch.utils.utils import ( + lazy_groups_of, sort_batch_by_length) # pylint: disable=attribute-defined-outside-init,protected-access @@ -66,10 +66,7 @@ "combine_initial_dims", "get_device_of", "get_dropout_mask", - "get_lengths_from_binary_sequence_mask", - "lazy_groups_of", "remove_sentence_boundaries", - "sort_batch_by_length", "uncombine_initial_dims", ] @@ -650,7 +647,7 @@ def sort_and_run_forward( batch_size = mask.size(0) num_valid = torch.sum(mask[:, 0]).int().item() - sequence_lengths = get_lengths_from_binary_sequence_mask(mask) + sequence_lengths = mask.long().sum(-1) ( sorted_inputs, sorted_sequence_lengths, @@ -1872,23 +1869,6 @@ def remove_sentence_boundaries(tensor: torch.Tensor, mask: torch.Tensor) -> \ return tensor_without_boundary_tokens, new_mask -A = TypeVar("A") - - -def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: - r"""Takes an iterable and batches the individual instances into lists of the - specified size. The last list may be smaller if there are instances left - over. 
- """ - iterator = iter(iterable) - while True: - s = list(islice(iterator, group_size)) - if len(s) > 0: - yield s - else: - break - - class ConfigurationError(Exception): r"""The exception raised by any AllenNLP object when it's misconfigured (e.g. missing properties, invalid properties, unknown properties). @@ -1904,69 +1884,6 @@ def __str__(self): return self.message -def get_lengths_from_binary_sequence_mask(mask: torch.Tensor): - r"""Compute sequence lengths for each batch element in a tensor using a - binary mask. - - # Parameters - - mask : torch.Tensor, required. - A 2D binary mask of shape (batch_size, sequence_length) to - calculate the per-batch sequence lengths from. - - # Returns - - A torch.LongTensor of shape (batch_size,) representing the lengths - of the sequences in the batch. - """ - return mask.long().sum(-1) - - -def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): - r"""Sort a batch first tensor by some specified lengths. - - # Parameters - - tensor : torch.FloatTensor, required. - A batch first Pytorch tensor. - sequence_lengths : torch.LongTensor, required. - A tensor representing the lengths of some dimension of the tensor which - we want to sort by. - - # Returns - - sorted_tensor : torch.FloatTensor - The original tensor sorted along the batch dimension with respect to - sequence_lengths. - sorted_sequence_lengths : torch.LongTensor - The original sequence_lengths sorted by decreasing size. - restoration_indices : torch.LongTensor - Indices into the sorted_tensor such that - `sorted_tensor.index_select(0, restoration_indices) == original_tensor` - permutation_index : torch.LongTensor - The indices used to sort the tensor. This is useful if you want to sort - many tensors using the same ordering. - """ - - if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, - torch.Tensor): - raise ConfigurationError( - "Both the tensor and sequence lengths must be torch.Tensors.") - - sorted_sequence_lengths, permutation_index = sequence_lengths.sort( - 0, descending=True) - sorted_tensor = tensor.index_select(0, permutation_index) - - index_range = torch.arange(0, len(sequence_lengths), - device=sequence_lengths.device) - # This is the equivalent of zipping with index, sorting by the original - # sequence lengths and returning the now sorted indices. - _, reverse_mapping = permutation_index.sort(0, descending=False) - restoration_indices = index_range.index_select(0, reverse_mapping) - return (sorted_tensor, sorted_sequence_lengths, restoration_indices, - permutation_index) - - def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], gain: float = 1.0) -> None: r"""An initializer which allows initializing model parameters in "blocks". 
diff --git a/texar/torch/modules/pretrained/elmo_utils_test.py b/texar/torch/modules/pretrained/elmo_utils_test.py index 34d826241..38e785e8d 100644 --- a/texar/torch/modules/pretrained/elmo_utils_test.py +++ b/texar/torch/modules/pretrained/elmo_utils_test.py @@ -40,11 +40,15 @@ from texar.torch.data.data_utils import maybe_download from texar.torch.modules.pretrained.elmo_utils import ( Highway, LstmCellWithProjection, _EncoderBase, _ElmoBiLm, TimeDistributed, - sort_batch_by_length, get_lengths_from_binary_sequence_mask, remove_sentence_boundaries, add_sentence_boundary_token_ids, - lazy_groups_of, block_orthogonal, ConfigurationError, combine_initial_dims, + block_orthogonal, ConfigurationError, combine_initial_dims, uncombine_initial_dims, ScalarMix) from texar.torch.utils.test import cuda_test +from texar.torch.utils.utils import sort_batch_by_length + + +import ssl +ssl._create_default_https_context = ssl._create_unverified_context class TestElmoBiLm(unittest.TestCase): @@ -166,7 +170,7 @@ def setUp(self): self.batch_size = 5 self.num_valid = 3 - sequence_lengths = get_lengths_from_binary_sequence_mask(mask) + sequence_lengths = mask.long().sum(-1) _, _, restoration_indices, sorting_indices = sort_batch_by_length( tensor, sequence_lengths) self.sorting_indices = sorting_indices @@ -735,51 +739,6 @@ def test_remove_sentence_boundaries(self): [[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long() assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all() - def test_lazy_groups_of(self): - xs = [1, 2, 3, 4, 5, 6, 7] - groups = lazy_groups_of(iter(xs), group_size=3) - assert next(groups) == [1, 2, 3] - assert next(groups) == [4, 5, 6] - assert next(groups) == [7] - with self.assertRaises(StopIteration): - _ = next(groups) - - def test_get_sequence_lengths_from_binary_mask(self): - binary_mask = torch.ByteTensor( - [[1, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1], - [1, 0, 0, 0, 0, 0]] - ) - lengths = get_lengths_from_binary_sequence_mask(binary_mask) - numpy.testing.assert_array_equal(lengths.numpy(), - numpy.array([3, 2, 6, 1])) - - def test_sort_tensor_by_length(self): - tensor = torch.rand([5, 7, 9]) - tensor[0, 3:, :] = 0 - tensor[1, 4:, :] = 0 - tensor[2, 1:, :] = 0 - tensor[3, 5:, :] = 0 - - sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) - sorted_tensor, sorted_lengths, reverse_indices, _ = \ - sort_batch_by_length(tensor, sequence_lengths) - - # Test sorted indices are padded correctly. - numpy.testing.assert_array_equal( - sorted_tensor[1, 5:, :].data.numpy(), 0.0) - numpy.testing.assert_array_equal( - sorted_tensor[2, 4:, :].data.numpy(), 0.0) - numpy.testing.assert_array_equal( - sorted_tensor[3, 3:, :].data.numpy(), 0.0) - numpy.testing.assert_array_equal( - sorted_tensor[4, 1:, :].data.numpy(), 0.0) - - assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) - - # Test restoration indices correctly recover the original tensor. 
- assert sorted_tensor.index_select(0, reverse_indices).data.equal( - tensor.data) - def test_block_orthogonal_can_initialize(self): tensor = torch.zeros([10, 6]) block_orthogonal(tensor, [5, 3]) diff --git a/texar/torch/utils/utils.py b/texar/torch/utils/utils.py index 426081587..68c4b833e 100644 --- a/texar/torch/utils/utils.py +++ b/texar/torch/utils/utils.py @@ -19,10 +19,12 @@ import copy import inspect from functools import lru_cache +from itertools import islice from pydoc import locate from typing import ( - Any, Callable, Collection, Dict, List, MutableMapping, Optional, Sequence, - Tuple, Type, TypeVar, Union, cast, no_type_check, overload) + Any, Callable, Collection, Dict, Iterable, Iterator, List, MutableMapping, + Optional, Sequence, Tuple, Type, TypeVar, Union, cast, no_type_check, + overload) import funcsigs import numpy as np @@ -67,6 +69,8 @@ 'uniquify_str', 'ceildiv', 'sum_tensors', + 'lazy_groups_of', + 'sort_batch_by_length', ] T = TypeVar('T') # type argument @@ -1196,3 +1200,67 @@ def truncate_seq_pair(tokens_a: Union[List[int], List[str]], tokens_a.pop() else: tokens_b.pop() + + +A = TypeVar("A") + + +def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: + r"""Takes an iterable and batches the individual instances into lists of the + specified size. The last list may be smaller if there are instances left + over. + + Args: + iterable: An iterable object. + group_size: The group size. + + Returns: + An iterator. + """ + iterator = iter(iterable) + while True: + s = list(islice(iterator, group_size)) + if len(s) > 0: + yield s + else: + break + + +def sort_batch_by_length(tensor: torch.Tensor, + sequence_lengths: torch.Tensor) -> \ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Sort a batch first tensor by some specified lengths. + + Args: + tensor: A batch first tensor. + sequence_lengths: A tensor representing the lengths of some dimension of + the tensor which we want to sort by. + + Returns: + sorted_tensor: The original tensor sorted along the batch dimension + with respect to `sequence_lengths`. + sorted_sequence_lengths: The original `sequence_lengths` sorted by + decreasing size. + restoration_indices: Indices into the `sorted_tensor` such that + ``sorted_tensor.index_select(0, restoration_indices) == + original_tensor`` + permutation_index: The indices used to sort the tensor. This is useful + if you want to sort many tensors using the same ordering. + """ + if not isinstance(tensor, torch.Tensor) or \ + not isinstance(sequence_lengths, torch.Tensor): + raise ValueError( + "Both the tensor and sequence lengths must be torch.Tensors.") + + sorted_sequence_lengths, permutation_index = sequence_lengths.sort( + 0, descending=True) + sorted_tensor = tensor.index_select(0, permutation_index) + + index_range = torch.arange(0, len(sequence_lengths), + device=sequence_lengths.device) + # This is the equivalent of zipping with index, sorting by the original + # sequence lengths and returning the now sorted indices. 
+ _, reverse_mapping = permutation_index.sort(0, descending=False) + restoration_indices = index_range.index_select(0, reverse_mapping) + return (sorted_tensor, sorted_sequence_lengths, restoration_indices, + permutation_index) diff --git a/texar/torch/utils/utils_test.py b/texar/torch/utils/utils_test.py index 2eb543a77..122a16276 100644 --- a/texar/torch/utils/utils_test.py +++ b/texar/torch/utils/utils_test.py @@ -195,25 +195,37 @@ def test_truncate_seq_pair(self): self.assertListEqual(tokens_a, [1]) self.assertListEqual(tokens_b, [2, 3]) - # def test_map_ids_to_strs(self): - # """Tests :func:`texar.torch.utils.map_ids_to_strs`. - # """ - # vocab_list = ['word', '词'] - # vocab_file = tempfile.NamedTemporaryFile() - # vocab_file.write('\n'.join(vocab_list).encode("utf-8")) - # vocab_file.flush() - # vocab = Vocab(vocab_file.name) - - # text = [['', 'word', '词', '', ''], - # ['word', '词', 'word', '词', '']] - # text = np.asarray(text) - # ids = vocab.map_tokens_to_ids_py(text) - - # ids = ids.tolist() - # text_ = utils.map_ids_to_strs(ids, vocab) - - # self.assertEqual(text_[0], 'word 词') - # self.assertEqual(text_[1], 'word 词 word 词') + def test_lazy_groups_of(self): + xs = [1, 2, 3, 4, 5, 6, 7] + groups = utils.lazy_groups_of(iter(xs), group_size=3) + assert next(groups) == [1, 2, 3] + assert next(groups) == [4, 5, 6] + assert next(groups) == [7] + with self.assertRaises(StopIteration): + _ = next(groups) + + def test_sort_batch_by_length(self): + tensor = torch.rand([5, 7, 9]) + tensor[0, 3:, :] = 0 + tensor[1, 4:, :] = 0 + tensor[2, 1:, :] = 0 + tensor[3, 5:, :] = 0 + + sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7]) + sorted_tensor, sorted_lengths, reverse_indices, _ = \ + utils.sort_batch_by_length(tensor, sequence_lengths) + + # Test sorted indices are padded correctly. + np.testing.assert_array_equal(sorted_tensor[1, 5:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[2, 4:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[3, 3:, :].data.numpy(), 0.0) + np.testing.assert_array_equal(sorted_tensor[4, 1:, :].data.numpy(), 0.0) + + assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1])) + + # Test restoration indices correctly recover the original tensor. + assert sorted_tensor.index_select(0, reverse_indices).data.equal( + tensor.data) if __name__ == "__main__":