forked from huggingface/torchMoji
-
Notifications
You must be signed in to change notification settings - Fork 5
/
create_vocab.py
271 lines (217 loc) · 9.52 KB
/
create_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import glob
import json
import uuid
from copy import deepcopy
from collections import defaultdict, OrderedDict
import numpy as np
from torchmoji.filter_utils import is_special_token
from torchmoji.word_generator import WordGenerator
from torchmoji.global_variables import SPECIAL_TOKENS, VOCAB_PATH
class VocabBuilder():
    """ Create vocabulary with words extracted from sentences as fed from a
        word generator.

        The builder keeps a running count per token in `word_counts`
        (a defaultdict starting at 0), ignores tokens that are empty or longer
        than `word_length_limit`, and can dump the counts to a compressed
        numpy archive.
    """
    def __init__(self, word_gen):
        """ Sets up empty counts and registers the special tokens.

        # Arguments:
            word_gen: Iterable producing the sentences to be counted.
                count_all_words() unpacks every item it yields as a
                two-element pair whose first element is the token list.
        """
        # initialize any new key with value of 0
        self.word_counts = defaultdict(lambda: 0, {})
        self.word_length_limit = 30

        # special tokens are always part of the vocabulary, with a count of 0
        for token in SPECIAL_TOKENS:
            assert len(token) < self.word_length_limit
            self.word_counts[token] = 0
        self.word_gen = word_gen

    def count_words_in_sentence(self, words):
        """ Generates word counts for all tokens in the given sentence.

        # Arguments:
            words: Tokenized sentence whose words should be counted.
        """
        for word in words:
            # skip empty tokens and tokens too long for the saved word field
            if 0 < len(word) <= self.word_length_limit:
                # word_counts is a defaultdict, so unseen words start at 0
                self.word_counts[word] += 1

    def save_vocab(self, path=None):
        """ Saves the vocabulary into a file.

        The counts are stored under the key 'data' as a structured array with
        a fixed-width byte-string field 'word' and an integer field 'count',
        ordered from the most to the least frequent word.

        # Arguments:
            path: Where the vocabulary should be saved. If not specified, a
                  randomly generated filename is used instead. numpy adds the
                  '.npz' extension when the name does not already carry it.
        """
        dtype = ([('word', '|S{}'.format(self.word_length_limit)),
                  ('count', 'int')])
        # dict.items() is a view object on Python 3, which numpy cannot
        # convert into a structured array; hand it a real list of tuples.
        # NOTE(review): the '|S' field is a byte string, so non-ASCII words
        # presumably fail to convert here - confirm against the tokenizer.
        np_dict = np.array(list(self.word_counts.items()), dtype=dtype)

        # sort from highest to lowest frequency: sorting the reversed view
        # in ascending order leaves the underlying array in descending order
        np_dict[::-1].sort(order='count')
        data = np_dict

        if path is None:
            path = str(uuid.uuid4())

        np.savez_compressed(path, data=data)
        print("Saved dict to {}".format(path))

    def get_next_word(self):
        """ Returns the next item from the word generator.

        A fresh iterator is requested from the word generator on every call.

        # Returns:
            Whatever the word generator yields first, i.e. the next
            tokenized sentence as produced by the generator.
        """
        # next(iter(...)) works on both Python 2 and 3, unlike `.next()`
        return next(iter(self.word_gen))

    def count_all_words(self):
        """ Generates word counts for all words in all sentences of the word
            generator.
        """
        # each item is a pair; only the token list (first element) is counted
        for words, _ in self.word_gen:
            self.count_words_in_sentence(words)
class MasterVocab():
    """ Combines vocabularies.

        Individual vocabularies (word -> count) are loaded from .npz files,
        rescaled so that every vocabulary carries the same total weight as the
        largest one, and summed into `master_vocab`.
    """
    def __init__(self):
        # maps word -> summed normalized count over all loaded vocabularies
        self.master_vocab = {}

    def populate_master_vocab(self, vocab_path, min_words=1, force_appearance=None):
        """ Populates the master vocabulary using all vocabularies found in the
            given path. Vocabularies should be named *.npz. Expects the
            vocabularies to be numpy arrays with counts. Normalizes the counts
            and combines them.

        # Arguments:
            vocab_path: Path containing vocabularies to be combined. It is
                used as a plain prefix for the '*.npz' glob pattern, so a
                directory has to be given with its trailing separator.
            min_words: Minimum amount of occurrences a word must have in order
                to be included in the master vocabulary.
            force_appearance: Optional vocabulary filename that will be added
                to the master vocabulary no matter what. This vocabulary must
                be present in vocab_path. When given, only words that occur in
                this vocabulary are added to the master vocabulary.

        # Raises:
            ValueError: If no vocabulary file matches the glob pattern.
            IndexError: If force_appearance matches none of the found files.
        """
        pattern = vocab_path + '*.npz'
        paths = glob.glob(pattern)
        if not paths:
            # max() below would fail with an unhelpful message otherwise
            raise ValueError('No vocabularies found matching {}'.format(pattern))

        sizes = {path: 0 for path in paths}
        dicts = {path: {} for path in paths}

        # set up and get sizes of individual dictionaries
        for path in paths:
            np_data = np.load(path)['data']

            for entry in np_data:
                word, count = entry
                if count < min_words:
                    continue
                # Words are stored as fixed-width byte strings. On Python 3
                # they come back as bytes, which neither compare equal to
                # text tokens nor serialize as JSON keys, so decode them.
                if bytes is not str and isinstance(word, bytes):
                    word = word.decode('utf-8')
                if is_special_token(word):
                    continue
                dicts[path][word] = count

            sizes[path] = sum(dicts[path].values())
            print('Overall word count for {} -> {}'.format(path, sizes[path]))
            print('Overall word number for {} -> {}'.format(path, len(dicts[path])))

        # the 'Min' label is historical; the values describe the largest vocabulary
        vocab_of_max_size = max(sizes, key=sizes.get)
        max_size = sizes[vocab_of_max_size]
        print('Min: {}, {}, {}'.format(sizes, vocab_of_max_size, max_size))

        # can force one vocabulary to always be present
        if force_appearance is not None:
            force_appearance_path = [p for p in paths if force_appearance in p][0]
            force_appearance_vocab = deepcopy(dicts[force_appearance_path])
            print(force_appearance_path)
        else:
            force_appearance_path, force_appearance_vocab = None, None

        # normalize word counts before inserting into master dict
        for path in paths:
            if not sizes[path]:
                # nothing to contribute, and the factor would divide by zero
                print('Skipping empty vocabulary {}'.format(path))
                continue

            normalization_factor = max_size / sizes[path]
            print('Norm factor for path {} -> {}'.format(path, normalization_factor))

            for word in dicts[path]:
                # special tokens were already dropped while loading
                normalized_count = dicts[path][word] * normalization_factor

                # can force one vocabulary to always be present: words
                # unknown to the forced vocabulary are left out
                if force_appearance_vocab is not None \
                        and word not in force_appearance_vocab:
                    continue

                self.master_vocab[word] = \
                    self.master_vocab.get(word, 0) + normalized_count

        print('Size of master_dict {}'.format(len(self.master_vocab)))
        print("Hashes for master dict: {}".format(
            sum(1 for w in self.master_vocab if w.startswith('#'))))

    def save_vocab(self, path_count, path_vocab, word_limit=100000):
        """ Saves the master vocabulary into a file.

        # Arguments:
            path_count: Where the compressed numpy archive with the
                (word, count) pairs is written (stored under key 'counts').
            path_vocab: Where the JSON mapping word -> index is written.
            word_limit: Maximum number of entries written to either file,
                special tokens included.
        """
        # special tokens come first so that they occupy the lowest indices
        words = OrderedDict()
        for token in SPECIAL_TOKENS:
            # store -1 instead of np.inf, which can overflow
            words[token] = -1

        # sort words by frequency
        words.update(sorted(self.master_vocab.items(),
                            key=lambda kv: kv[1], reverse=True))

        # use encoding of up to 30 characters (no token conversions)
        # use float to store large numbers (we don't care about precision loss)
        # items() is a view on Python 3, so it is materialized for numpy
        np_vocab = np.array(list(words.items()),
                            dtype=([('word', '|S30'), ('count', 'float')]))

        # output count for debugging
        counts = np_vocab[:word_limit]
        np.savez_compressed(path_count, counts=counts)

        # output the index of each word for easy lookup
        # keys() cannot be sliced on Python 3, hence the list()
        final_words = OrderedDict()
        for i, w in enumerate(list(words)[:word_limit]):
            final_words[w] = i
        with open(path_vocab, 'w') as f:
            f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))
def all_words_in_sentences(sentences):
    """ Extracts all unique words from a given list of sentences.

    # Arguments:
        sentences: List or word generator of sentences to be processed.
            Items of a WordGenerator are unpacked as pairs and only their
            first element (the sentence) is used.

    # Returns:
        List of all unique words contained in the given sentences, in order
        of first appearance.
    """
    vocab = []
    # companion set for O(1) membership tests; `vocab` keeps the order
    seen = set()
    if isinstance(sentences, WordGenerator):
        sentences = (s for s, _ in sentences)

    for sentence in sentences:
        for word in sentence:
            if word not in seen:
                seen.add(word)
                vocab.append(word)

    return vocab
def extend_vocab_in_file(vocab, max_tokens=10000, vocab_path=VOCAB_PATH):
    """ Extends a JSON-formatted vocabulary file with the words from vocab
        that it does not contain yet. Adds up to max_tokens words and
        overwrites the file in vocab_path.

        If the file cannot be opened or read, a message is printed and
        nothing is written.

    # Arguments:
        vocab: Vocabulary to be added. MUST have word_counts populated, i.e.
            must have run count_all_words() previously.
        max_tokens: Maximum number of words to be added.
        vocab_path: Path to the vocabulary json which is to be extended.
    """
    try:
        with open(vocab_path, 'r') as vocab_file:
            stored_vocab = json.load(vocab_file)
    except IOError:
        print('Vocabulary file not found, expected at ' + vocab_path)
        return

    # mutates stored_vocab in place
    extend_vocab(stored_vocab, vocab, max_tokens)

    # write the extended vocabulary back over the original file
    with open(vocab_path, 'w') as vocab_file:
        json.dump(stored_vocab, vocab_file,
                  sort_keys=True, indent=4, separators=(',', ': '))
def extend_vocab(current_vocab, new_vocab, max_tokens=10000):
    """ Extends current vocabulary with words from vocab that are not
        present in the current vocabulary. Adds up to max_tokens words.

        Words are considered from the most to the least frequent and each new
        word receives the next free index, starting at len(current_vocab).
        current_vocab is modified in place.

    # Arguments:
        current_vocab: Current dictionary of tokens.
        new_vocab: Vocabulary to be added. MUST have word_counts populated, i.e.
            must have run count_all_words() previously.
        max_tokens: Maximum number of words to be added. A negative value
            falls back to 10000.

    # Returns:
        How many new tokens have been added.
    """
    if max_tokens < 0:
        max_tokens = 10000

    word_counts = new_vocab.word_counts
    # sort words by frequency; sorted() is stable, so equally frequent words
    # keep the order in which word_counts yields them
    by_frequency = sorted(word_counts, key=word_counts.get, reverse=True)

    base_index = len(current_vocab)
    added = 0
    for word in by_frequency:
        if added >= max_tokens:
            break
        # test against the dict itself: `.keys()` builds a list on Python 2
        if word not in current_vocab:
            current_vocab[word] = base_index + added
            added += 1

    return added