Add ELMo modules #298

Closed · wants to merge 3 commits
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ numpy >= 1.15.4
mypy_extensions >= 0.4.1
regex >= 2018.01.10
sentencepiece >= 0.1.8
h5py >= 2.10.0
1 change: 1 addition & 0 deletions setup.py
@@ -33,6 +33,7 @@
install_requires=[
'regex>=2018.01.10',
'numpy',
'h5py>=2.10.0',
'requests',
'funcsigs',
'sentencepiece>=0.1.8',
131 changes: 131 additions & 0 deletions texar/torch/data/tokenizers/elmo_tokenizer_utils.py
@@ -0,0 +1,131 @@
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for the pre-trained ELMo tokenizer.

Code adapted from:
`https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/elmo_indexer.py`
"""
from typing import Dict, List, Optional

import torch

from torch.nn.utils.rnn import pad_sequence


__all__ = [
"ELMoCharacterMapper",
"batch_to_ids",
]


def _make_bos_eos(character: int,
padding_character: int,
beginning_of_word_character: int,
end_of_word_character: int,
max_word_length: int):
char_ids = [padding_character] * max_word_length
char_ids[0] = beginning_of_word_character
char_ids[1] = character
char_ids[2] = end_of_word_character
return char_ids


class ELMoCharacterMapper:
r"""Maps individual tokens to sequences of character ids, compatible with
ELMo. To be consistent with previously trained models, we include it here as
special of existing character indexers.

We allow to add optional additional special tokens with designated
character ids with `tokens_to_add`.
"""

max_word_length = 50

# char ids 0-255 come from utf-8 encoding bytes
# assign 256-300 to special chars
beginning_of_sentence_character = 256 # <begin sentence>
end_of_sentence_character = 257 # <end sentence>
beginning_of_word_character = 258 # <begin word>
end_of_word_character = 259 # <end word>
padding_character = 260 # <padding>

beginning_of_sentence_characters = _make_bos_eos(
beginning_of_sentence_character,
padding_character,
beginning_of_word_character,
end_of_word_character,
max_word_length,
)
end_of_sentence_characters = _make_bos_eos(
end_of_sentence_character,
padding_character,
beginning_of_word_character,
end_of_word_character,
max_word_length,
)

bos_token = "<S>"
eos_token = "</S>"

def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None:
self.tokens_to_add = tokens_to_add or {}

def convert_word_to_char_ids(self, word: str) -> List[int]:
if word in self.tokens_to_add:
char_ids = [self.padding_character] * self.max_word_length
char_ids[0] = self.beginning_of_word_character
char_ids[1] = self.tokens_to_add[word]
char_ids[2] = self.end_of_word_character
elif word == self.bos_token:
char_ids = self.beginning_of_sentence_characters
elif word == self.eos_token:
char_ids = self.end_of_sentence_characters
else:
word_encoded = word.encode("utf-8", "ignore")[: (
self.max_word_length - 2)]
char_ids = [self.padding_character] * self.max_word_length
char_ids[0] = self.beginning_of_word_character
for k, chr_id in enumerate(word_encoded, start=1):
char_ids[k] = chr_id
char_ids[len(word_encoded) + 1] = self.end_of_word_character

        # Offset every id by one so that 0 can be used for masking.
return [c + 1 for c in char_ids]

def __eq__(self, other) -> bool:
if isinstance(self, other.__class__):
return self.__dict__ == other.__dict__
return NotImplemented


def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
r"""Converts a batch of tokenized sentences to a tensor representing the
sentences with encoded characters (len(batch), max sentence length,
max word length).

Args:
batch: A list of tokenized sentences.

Returns:
A tensor of padded character ids.
"""
res = []
mapper = ELMoCharacterMapper()
for sentence in batch:
character_ids = [mapper.convert_word_to_char_ids(token)
for token in sentence]
res.append(torch.tensor(character_ids))

return pad_sequence(res, batch_first=True)
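
As a quick orientation for reviewers, here is a minimal usage sketch of the two utilities above (not part of the diff; it assumes this branch is installed so the new module path resolves):

from texar.torch.data.tokenizers.elmo_tokenizer_utils import (
    ELMoCharacterMapper, batch_to_ids)

# Map a single token to its fixed-length (50) list of character ids.
# All ids are offset by +1 so that 0 is reserved for masking/padding.
mapper = ELMoCharacterMapper()
char_ids = mapper.convert_word_to_char_ids("sentence")
assert len(char_ids) == ELMoCharacterMapper.max_word_length  # 50

# Convert a whole batch of tokenized sentences; shorter sentences are
# padded with all-zero character rows by ``pad_sequence``.
sentences = [["First", "sentence", "."], ["Another", "."]]
character_ids = batch_to_ids(sentences)
print(character_ids.shape)  # torch.Size([2, 3, 50])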
72 changes: 72 additions & 0 deletions texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py
@@ -0,0 +1,72 @@
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit tests for the utilities of the pre-trained ELMo tokenizer.

Code adapted from:
`https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py`
"""

import unittest

from texar.torch.data.tokenizers.elmo_tokenizer_utils import (
ELMoCharacterMapper, batch_to_ids)


class ELMoTokenizerUtilsTest(unittest.TestCase):

def test_bos_to_char_ids(self):
mapper = ELMoCharacterMapper()
indices = mapper.convert_word_to_char_ids('<S>')
# [<begin word>, <begin sentence>, <end word>, <padding>, ... <padding>]
expected_indices = [259, 257, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_eos_to_char_ids(self):
mapper = ELMoCharacterMapper()
indices = mapper.convert_word_to_char_ids('</S>')
expected_indices = [259, 258, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_unicode_to_char_ids(self):
mapper = ELMoCharacterMapper()
indices = mapper.convert_word_to_char_ids(chr(256) + "t")
expected_indices = [259, 197, 129, 117, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_additional_tokens(self):
mapper = ELMoCharacterMapper(tokens_to_add={"<first>": 1})
indices = mapper.convert_word_to_char_ids("<first>")
expected_indices = [259, 2, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_batch_to_ids(self):
sentences = [['First', 'sentence', '.'], ['Another', '.']]
indices = batch_to_ids(sentences)
expected_indices = [[
[259, 71, 106, 115, 116, 117, 260] + [261] * 43,
[259, 116, 102, 111, 117, 102, 111, 100, 102, 260] + [261] * 40,
[259, 47, 260] + [261] * 47], [
[259, 66, 111, 112, 117, 105, 102, 115, 260] + [261] * 41,
[259, 47, 260] + [261] * 47,
[0] * 50]]
self.assertEqual(indices.tolist(), expected_indices)


if __name__ == "__main__":
unittest.main()
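
For anyone cross-checking the expected values in these tests: they follow directly from the special-character ids defined on ELMoCharacterMapper plus the +1 masking offset (begin-word 258 → 259, begin-sentence 256 → 257, end-word 259 → 260, padding 260 → 261). A small sanity check, using the same module path as the test imports:

from texar.torch.data.tokenizers.elmo_tokenizer_utils import ELMoCharacterMapper

# '<S>' maps to [begin-word, begin-sentence, end-word, padding, ...],
# each id shifted by +1: [259, 257, 260, 261, ..., 261].
ids = ELMoCharacterMapper().convert_word_to_char_ids("<S>")
assert ids[:3] == [259, 257, 260]
assert ids[3:] == [261] * (ELMoCharacterMapper.max_word_length - 3)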
1 change: 1 addition & 0 deletions texar/torch/modules/encoders/__init__.py
@@ -17,6 +17,7 @@

from texar.torch.modules.encoders.bert_encoder import *
from texar.torch.modules.encoders.conv_encoders import *
from texar.torch.modules.encoders.elmo_encoder import *
from texar.torch.modules.encoders.encoder_base import *
from texar.torch.modules.encoders.gpt2_encoder import *
from texar.torch.modules.encoders.multihead_attention import *