Add ELMo modules #298

Closed · wants to merge 3 commits
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ numpy >= 1.15.4
mypy_extensions >= 0.4.1
regex >= 2018.01.10
sentencepiece >= 0.1.8
h5py >= 2.10.0
1 change: 1 addition & 0 deletions setup.py
@@ -33,6 +33,7 @@
install_requires=[
'regex>=2018.01.10',
'numpy',
'h5py>=2.10.0',
'requests',
'funcsigs',
'sentencepiece>=0.1.8',
131 changes: 131 additions & 0 deletions texar/torch/data/tokenizers/elmo_tokenizer_utils.py
@@ -0,0 +1,131 @@
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for the pre-trained ELMo tokenizer.

Code adapted from:
`https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/elmo_indexer.py`
"""
from typing import Dict, List, Optional

import torch

from torch.nn.utils.rnn import pad_sequence


__all__ = [
"ELMoCharacterMapper",
"batch_to_ids",
]


def _make_bos_eos(character: int,
padding_character: int,
beginning_of_word_character: int,
end_of_word_character: int,
max_word_length: int):
char_ids = [padding_character] * max_word_length
char_ids[0] = beginning_of_word_character
char_ids[1] = character
char_ids[2] = end_of_word_character
return char_ids


class ELMoCharacterMapper:
r"""Maps individual tokens to sequences of character ids, compatible with
ELMo. To be consistent with previously trained models, we include it here as
special of existing character indexers.

We allow to add optional additional special tokens with designated
character ids with `tokens_to_add`.
"""

max_word_length = 50

# char ids 0-255 come from utf-8 encoding bytes
# assign 256-300 to special chars
beginning_of_sentence_character = 256 # <begin sentence>
end_of_sentence_character = 257 # <end sentence>
beginning_of_word_character = 258 # <begin word>
end_of_word_character = 259 # <end word>
padding_character = 260 # <padding>

beginning_of_sentence_characters = _make_bos_eos(
beginning_of_sentence_character,
padding_character,
beginning_of_word_character,
end_of_word_character,
max_word_length,
)
end_of_sentence_characters = _make_bos_eos(
end_of_sentence_character,
padding_character,
beginning_of_word_character,
end_of_word_character,
max_word_length,
)

bos_token = "<S>"
eos_token = "</S>"

def __init__(self, tokens_to_add: Optional[Dict[str, int]] = None) -> None:
self.tokens_to_add = tokens_to_add or {}

def convert_word_to_char_ids(self, word: str) -> List[int]:
if word in self.tokens_to_add:
char_ids = [self.padding_character] * self.max_word_length
char_ids[0] = self.beginning_of_word_character
char_ids[1] = self.tokens_to_add[word]
char_ids[2] = self.end_of_word_character
elif word == self.bos_token:
char_ids = self.beginning_of_sentence_characters
elif word == self.eos_token:
char_ids = self.end_of_sentence_characters
else:
word_encoded = word.encode("utf-8", "ignore")[: (
self.max_word_length - 2)]
char_ids = [self.padding_character] * self.max_word_length
char_ids[0] = self.beginning_of_word_character
for k, chr_id in enumerate(word_encoded, start=1):
char_ids[k] = chr_id
char_ids[len(word_encoded) + 1] = self.end_of_word_character

        # Offset every id by one so that 0 can be used for masking.
return [c + 1 for c in char_ids]

def __eq__(self, other) -> bool:
if isinstance(self, other.__class__):
return self.__dict__ == other.__dict__
return NotImplemented


def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
r"""Converts a batch of tokenized sentences to a tensor representing the
sentences with encoded characters (len(batch), max sentence length,
max word length).

Args:
batch: A list of tokenized sentences.

Returns:
A tensor of padded character ids.
"""
res = []
mapper = ELMoCharacterMapper()
for sentence in batch:
character_ids = [mapper.convert_word_to_char_ids(token)
for token in sentence]
res.append(torch.tensor(character_ids))

return pad_sequence(res, batch_first=True)
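
As a quick orientation for reviewers, here is a minimal usage sketch of the two utilities above (not part of the diff; it assumes this branch is installed so the new module path resolves):

from texar.torch.data.tokenizers.elmo_tokenizer_utils import (
    ELMoCharacterMapper, batch_to_ids)

# Map a single token to its fixed-length (50) list of character ids.
# All ids are offset by +1 so that 0 is reserved for masking/padding.
mapper = ELMoCharacterMapper()
char_ids = mapper.convert_word_to_char_ids("sentence")
assert len(char_ids) == ELMoCharacterMapper.max_word_length  # 50

# Convert a whole batch of tokenized sentences; shorter sentences are
# padded with all-zero character rows by ``pad_sequence``.
sentences = [["First", "sentence", "."], ["Another", "."]]
character_ids = batch_to_ids(sentences)
print(character_ids.shape)  # torch.Size([2, 3, 50])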
72 changes: 72 additions & 0 deletions texar/torch/data/tokenizers/elmo_tokenizer_utils_test.py
@@ -0,0 +1,72 @@
# Copyright 2019 The Texar Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit tests for the utilities of the pre-trained ELMo tokenizer.

Code adapted from:
`https://github.com/allenai/allennlp/blob/master/allennlp/tests/data/token_indexers/elmo_indexer_test.py`
"""

import unittest

from texar.torch.data.tokenizers.elmo_tokenizer_utils import (
ELMoCharacterMapper, batch_to_ids)


class ELMoTokenizerUtilsTest(unittest.TestCase):

def test_bos_to_char_ids(self):
mapper = ELMoCharacterMapper()
indices = mapper.convert_word_to_char_ids('<S>')
# [<begin word>, <begin sentence>, <end word>, <padding>, ... <padding>]
expected_indices = [259, 257, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_eos_to_char_ids(self):
mapper = ELMoCharacterMapper()
indices = mapper.convert_word_to_char_ids('</S>')
expected_indices = [259, 258, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_unicode_to_char_ids(self):
mapper = ELMoCharacterMapper()
indices = mapper.convert_word_to_char_ids(chr(256) + "t")
expected_indices = [259, 197, 129, 117, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_additional_tokens(self):
mapper = ELMoCharacterMapper(tokens_to_add={"<first>": 1})
indices = mapper.convert_word_to_char_ids("<first>")
expected_indices = [259, 2, 260]
expected_indices.extend([261] * (50 - len(expected_indices)))
self.assertEqual(indices, expected_indices)

def test_batch_to_ids(self):
sentences = [['First', 'sentence', '.'], ['Another', '.']]
indices = batch_to_ids(sentences)
expected_indices = [[
[259, 71, 106, 115, 116, 117, 260] + [261] * 43,
[259, 116, 102, 111, 117, 102, 111, 100, 102, 260] + [261] * 40,
[259, 47, 260] + [261] * 47], [
[259, 66, 111, 112, 117, 105, 102, 115, 260] + [261] * 41,
[259, 47, 260] + [261] * 47,
[0] * 50]]
self.assertEqual(indices.tolist(), expected_indices)


if __name__ == "__main__":
unittest.main()
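
For anyone cross-checking the expected values in these tests: they follow directly from the special-character ids defined on ELMoCharacterMapper plus the +1 masking offset (begin-word 258 → 259, begin-sentence 256 → 257, end-word 259 → 260, padding 260 → 261). A small sanity check, using the same module path as the test imports:

from texar.torch.data.tokenizers.elmo_tokenizer_utils import ELMoCharacterMapper

# '<S>' maps to [begin-word, begin-sentence, end-word, padding, ...],
# each id shifted by +1: [259, 257, 260, 261, ..., 261].
ids = ELMoCharacterMapper().convert_word_to_char_ids("<S>")
assert ids[:3] == [259, 257, 260]
assert ids[3:] == [261] * (ELMoCharacterMapper.max_word_length - 3)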
1 change: 1 addition & 0 deletions texar/torch/modules/encoders/__init__.py
@@ -17,6 +17,7 @@

from texar.torch.modules.encoders.bert_encoder import *
from texar.torch.modules.encoders.conv_encoders import *
from texar.torch.modules.encoders.elmo_encoder import *
from texar.torch.modules.encoders.encoder_base import *
from texar.torch.modules.encoders.gpt2_encoder import *
from texar.torch.modules.encoders.multihead_attention import *