-
Notifications
You must be signed in to change notification settings - Fork 0
/
elmo.py
50 lines (43 loc) · 1.85 KB
/
elmo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
Defines a module that generates ELMo embeddings for input sentences
"""
import numpy as np
import torch
from allennlp.modules.elmo import Elmo as allen_elmo
from allennlp.modules.elmo import batch_to_ids
from torch import nn
class Elmo(nn.Module):
def __init__(self, device):
""" Load the ELMo model. The first time you run this, it will download a pretrained model. """
super(Elmo, self).__init__()
options = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weights = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
# initialize instance of Elmo model
self.elmo = allen_elmo(options, weights, 2, dropout=0)
self._dev = device
self.to(self._dev)
def forward(self, batch):
"""
:param batch: List of tokenized sentences of varying lengths
:return: Embeddings of dimension (batch, seq_len, embedding_dim), where seq_len is the max-length
"""
# embeddings['elmo_representations'] is length two list of tensors.
# Each element contains one layer of ELMo representations with shape
# (batch, seq_len, embedding_dim).
char_ids = batch_to_ids(batch).to(self._dev)
embeddings = self.elmo(char_ids)
return embeddings['elmo_representations'][-1]
def get_elmo_vectors(sentences):
"""
Save all elmo vectors in a corpus to a file
:param sentences: list of list of tokens
:return:
"""
token_lens = [len(s) for s in sentences]
np.save("lens_test.npy", token_lens)
print("saved lengths")
elmo = Elmo()
embeddings = elmo(sentences)
embeddings = embeddings.detach().numpy()
np.save("elmo_test.npy", embeddings)
print("finished")