#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Module for expanding contractions in english text. """
__author__ = "Yannick Couzinié"
# standard library imports
import itertools
import operator
import yaml
# third party library imports
import nltk
# local imports
import utils
def _extract_contractions(sent):
"""
Args:
sent - a single sentence split up into (word, pos) tuples.
Returns:
List with the indices in the sentence where the contraction
starts.
Or None if no contractions are in the sentence.
    Based on the POS tags and the presence or absence of an apostrophe,
    extract the existing contractions.
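
    For example (with hypothetical Penn Treebank tags of the kind the
    Stanford tagger emits):

    >>> _extract_contractions([("I", "PRP"), ("'m", "VBP"), ("fine", "JJ")])
    [1]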
"""
idx_lst = []
for i, word_pos in enumerate(sent):
# If the word in the word_pos tuple begins with an apostrophe,
# add the index to idx_list.
if word_pos[0][0] == "'":
if word_pos[1] != 'POS':
                # POS is the Penn Treebank tag for a possessive ending
idx_lst.append(i)
elif word_pos[0] == "n't":
            # "n't" does not begin with an apostrophe, so it is matched
            # explicitly here
idx_lst.append(i)
    if idx_lst:
        return idx_lst
    # otherwise return None explicitly to signal that the sentence
    # contains no contractions
    return None


def _consecutive_sub_list(int_list):
"""
Args:
- int_list is a list whose consecutive sub-lists are yielded
from this function.
Yields:
- The consecutive sub-lists
This is basically an adaptation from
https://docs.python.org/2.6/library/itertools.html#examples for
Python 3.
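
    For example:

    >>> list(_consecutive_sub_list([1, 2, 3, 7, 8]))
    [[1, 2, 3], [7, 8]]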
"""
    # We group the items with a key function that subtracts each item's
    # enumeration index from the item itself. That difference is constant
    # exactly along runs of consecutive integers, so each group holds one
    # consecutive sub-list.
    # The map with the itemgetter then maps the grouped (index, item)
    # pairs back to the actual items before the sub-list is yielded.
for _, index in itertools.groupby(enumerate(int_list),
lambda x: x[1]-x[0]):
yield list(map(operator.itemgetter(1), index))


def _return_replacement(inp_tuple, argmax, disambiguations):
"""
Args:
- inp_tuple = the input tuple which needs to be counterchecked with the
dictionary.
- disambiguations = dictionary with all the replacements.
- argmax = boolean of whether to take the argmax or not, in case of
ambiguous cases.
Returns:
- The recommended replacements as stored in the values of
disambiguations. It returns None in case of no replacements.
This function strictly serves to return the replacements for ambiguous
cases, i.e. as stored in disambiguations.yaml.
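
    A sketch with a hypothetical entry of the kind stored in
    disambiguations.yaml (values are corpus counts):

    >>> disamb = {(("she", "PRP"), ("'s", "VBZ"), "VBN"): {"she has": 3,
    ...                                                    "she is": 1}}
    >>> _return_replacement((("she", "PRP"), ("'s", "VBZ"), "VBN"),
    ...                     True, disamb)
    'she has'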
"""
if inp_tuple in disambiguations:
if len(disambiguations[inp_tuple].keys()) == 1:
# if this is unambiguous just handle it
return list(disambiguations[inp_tuple])[0]
else:
if not argmax:
# if one should not take the argmax just replace nothing. This
# is not recommended, but in the future it might be interesting
# to differentiate the cases.
return None
            # if it is ambiguous, find the case with the most occurrences
max_val = max(disambiguations[inp_tuple].values())
if list(disambiguations[inp_tuple].values()).count(max_val) == 1:
# if there is exactly one replacement with the highest
# value, choose that
for key, value in disambiguations[inp_tuple].items():
if value == max_val:
return key
else:
                # if it is still ambiguous, make no replacement; such
                # cases call for improving the disambiguations dictionary.
return None
else:
        # if the case is not even in the dictionary, skip it; such cases
        # call for extending the disambiguations dictionary.
return None


def _disambiguate(sent, rplc_tuple, disambiguations, add_tags,
                  argmax=True):
"""
    Args:
    - sent is the same sentence as in rplc_tuple but with the
      pos_tags.
    - rplc_tuple is the tuple containing the index of replacement,
      the suggested replacements and the sentence.
    - disambiguations dictionary
    - add_tags is the number of additional POS tags in the keys of the
      disambiguations dictionary.
    - argmax = in case the disambiguation case is also ambiguous, use
      the expansion with more occurrences in the corpus. If that still
      doesn't help, don't change the input.
    Returns:
    - the expanded sentence (as far as it is unambiguous).
    Use the disambiguation dictionary to disambiguate the expansions.
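
    A minimal sketch, assuming a hypothetical unambiguous entry:

    >>> disamb = {(("she", "PRP"), ("'s", "VBZ"), "VBN"): {"she has": 3}}
    >>> _disambiguate([("she", "PRP"), ("'s", "VBZ"), ("gone", "VBN")],
    ...               ([0, 1], ["she", "'s"], []), disamb, 1)
    ['she', 'has', 'gone']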
"""
    # first we need to check again whether the first word is capitalized.
    # A special case is when the first character actually is an apostrophe,
    # like 't (in 'tis), so check whether one of the first two characters
    # is upper case and make sure that the word is not the ner_tag.
    if ((sent[0][0][0].isupper()
         # guard the second-character check against one-letter words
         or (len(sent[0][0]) > 1 and sent[0][0][1].isupper()))
            and sent[0][0] != "<NE>"):
        capitalized = True
        sent[0] = (sent[0][0].lower(), sent[0][1])
else:
capitalized = False
# make the input tuple which is of the form of the dictionary keys
inp_tuple = [sent[i] for i in rplc_tuple[0]]
# append the pos tags for the rest
inp_tuple += [sent[i][1] for i in range(rplc_tuple[0][-1]+1,
rplc_tuple[0][-1]+1+add_tags)]
inp_tuple = tuple(inp_tuple)
# analyze disambiguations for the correct replacement
replacement = _return_replacement(inp_tuple, argmax, disambiguations)
# now do the replacements
sent = _remove_pos_tags(sent)
if replacement is not None:
for i, index in enumerate(rplc_tuple[0]):
sent[index] = replacement.split()[i]
if capitalized:
sent[0] = sent[0].title()
return sent


def _check_if_contr_in_dict(consecutive, sent, contractions):
"""
Args:
- consecutive = a list of consecutive indices at which sent contains
contractions.
- sent = a (word, pos_tag) list, whereby the words make up a sentence.
- contractions = the contractions dictionary.
Returns:
- the list of possible expansions.
Raises:
- ValueError if the contractions have questionable capitalization,
which will not be reproduced upon expansion since that would be too
cumbersome.
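
    For example, with a minimal (hypothetical) contractions dictionary:

    >>> _check_if_contr_in_dict([0, 1], [("I", "PRP"), ("'m", "VBP")],
    ...                         {"i'm": ["i am"]})
    (['I am'], ['I', "'m"])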
"""
    # combine all the words that make up the contraction, i.e. the whole
    # consecutive span, which may also include the word in front of the
    # first apostrophe
contr = [word_pos[0] for word_pos
in sent[consecutive[0]:consecutive[-1]+1]]
# if the expanded string is one of the known contractions,
# extract the suggested expansions.
# Note that however many expansions there are, expanded is a list!
if ''.join(contr) in contractions:
expanded = contractions[''.join(contr)]
# the dictionary only contains non-capitalized replacements,
# check for capitalization
elif ''.join(contr).lower() in contractions:
if ''.join(contr)[0].isupper() or ''.join(contr)[1].isupper():
# capitalize the replacement in this case
expanded = [a.capitalize() for a in
contractions[''.join(contr).lower()]]
else:
raise ValueError("Weird capitalization error! Please use standard "
"english grammar.")
else:
# if the replacement is unknown skip to the next one
return None, contr
return expanded, contr


def _extract_replacements(idx_lst, sent, contractions):
"""
Args:
idx_lst - The list of indices for the position of contractions
in sent.
sent - List of (word, pos) tuples.
contractions - dictionary of contractions in the form of:
'contracted string' : 'list of possible
replacements'
    Yields:
        Tuples in the form of (indices of the words to be replaced,
                               words to be replaced,
                               list of suggested replacements).
        Examples are: ([0, 1], ["I", "'m"], [["I", "am"]])
                      ([0, 1], ["She", "'s"], [["She", "is"], ["She", "has"]])
Based on the idx_lst and the contractions dictionary, give a list of
replacements which shall be performed on the words in sent.
"""
# loop over all the consecutive parts
for consecutive in _consecutive_sub_list(idx_lst):
# first test the consecutive list like this
expanded, contr = _check_if_contr_in_dict(consecutive,
sent,
contractions)
        if expanded is None:
            # prepend the index of the word in front of the apostrophe,
            # since dictionary keys such as "I'm" also include that word
            consecutive = [consecutive[0]-1] + consecutive
            expanded, contr = _check_if_contr_in_dict(consecutive,
                                                      sent,
                                                      contractions)
        if expanded is None:
            print("WARNING: Unknown replacement: ", ''.join(contr))
            # leave unknown contractions untouched; indexing into an
            # empty expansion list below would otherwise crash
            continue
# separate the phrases into their respective words again.
if "<NE>" in expanded[0]:
            # insert a random name (here the name of a more or less famous
            # Japanese female head of the Ii family) to avoid <NE> being split.
expanded = [exp.replace("<NE>", "Naotora") for exp in expanded]
expanded = [nltk.word_tokenize(a) for a in expanded]
for i, _sent in enumerate(expanded):
for j in [k for k, x in enumerate(_sent) if x == "Naotora"]:
expanded[i][j] = "<NE>"
else:
expanded = [nltk.word_tokenize(a) for a in expanded]
yield (consecutive, contr, expanded)


def _remove_pos_tags(sent):
"""
Args:
sent - list of (word, pos) tuples
Returns:
A list of only lexical items.
Convert a list of (word, pos) tuples back to a list of only words.
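
    For example:

    >>> _remove_pos_tags([("I", "PRP"), ("'m", "VBP")])
    ['I', "'m"]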
"""
    return [word_pos[0] for word_pos in sent]


def _do_replacements(sent, idx_lst, add_tags, contractions, disambiguations):
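    """
    Args:
        sent - list of (word, pos) tuples.
        idx_lst - list of indices at which sent contains contractions,
                  or None if it contains none.
        add_tags - number of additional POS tags in the keys of the
                   disambiguations dictionary.
        contractions - the contractions dictionary.
        disambiguations - the disambiguations dictionary.
    Returns:
        The sentence as a plain list of words, with all resolvable
        contractions expanded.
    With no contractions the sentence only loses its POS tags:

    >>> _do_replacements([("Hi", "UH")], None, 0, {}, {})
    ['Hi']
    """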
tmp = _remove_pos_tags(sent)
# only do something if there are any replacements
if idx_lst is None:
return tmp
# evaluate the needed replacements, and loop over them
for rplc_tuple in _extract_replacements(idx_lst,
sent,
contractions):
# if the replacement is unambiguous, do it.
if len(rplc_tuple[2]) == 1:
if len(rplc_tuple[1]) == len(rplc_tuple[2][0]):
                # the replacement has exactly as many words as the
                # span that is being replaced
for i, index in enumerate(rplc_tuple[0]):
tmp[index] = rplc_tuple[2][0][i]
else:
for i, word in enumerate(rplc_tuple[2][0]):
if i >= len(rplc_tuple[0]):
# if the replacing string is longer than the
# original text, we need to move all the elements
# back to fit the new words in.
# save the good text since it is not to be replaced
tmp2 = tmp[rplc_tuple[0][0]+i:]
# delete anything after the last replacement
del tmp[rplc_tuple[0][0]+i:]
                        # append the remaining replacement words, since
                        # everything from here on extends the sentence
                        tmp += rplc_tuple[2][0][i:]
# add the good bits again
tmp += tmp2
break
else:
# otherwise just replace
tmp[rplc_tuple[0][0]+i] = word
if len(rplc_tuple[2][0]) < len(rplc_tuple[0]):
# if there is less to replace than there originally
# was, remove anything that was not touched
del tmp[rplc_tuple[0][0]+len(rplc_tuple[2][0]):
rplc_tuple[0][-1]+1]
else:
# else deal with the ambiguous case
tmp = _disambiguate(sent, rplc_tuple,
disambiguations, add_tags)
return tmp


def expand_contractions(stanford_model,
sent_list,
is_split=True,
use_ner=False,
ner_args=None):
"""
Args:
stanford_model - object of StanfordPOSTagger, as returned by
load_stanford_pos.
sent_list - list of sentences which are split up by word.
For the splitting use nltk.word_tokenize.
        is_split - boolean indicating whether the sentences in
                   sent_list are already split into words. If they
                   are not, provide each sentence as a single string.
use_ner - boolean to decide whether to use
named-entity-recognition for a potential increase in
accuracy but with the obvious costs of performance.
ner_args - is a list with an object of StanfordNERTagger and
the tag to be used. This only needs to be
supplied if use_ner is true.
Returns:
sent_list with expanded contractions.
Raises:
        ValueError if use_ner is True but no ner_args list is supplied.
This method uses the StanfordPOSTagger tags to identify contractions in
the sentence and expand them sensibly. Some examples are:
"I'm" -> "I am"
"It's difficult" -> "It is difficult"
    The difficulty is that sometimes "'s" does not indicate a
    contraction but possession, as in
        "It's legs were shaking"
    which should not be expanded. The Stanford tagger tags this "'s" as
    "POS" for possessive ending, which makes these cases easy to identify.
Furthermore, a difficulty lies in the fact that the expansion is not
unique. Without context we have for example the following:
"I'll" -> "I will" or "I shall"
"""
if use_ner and (ner_args is None):
raise ValueError("The use_ner flag is True but no NER"
" model has been supplied!")
with open("contractions.yaml", "r") as stream:
# load the dictionary containing all the contractions
contractions = yaml.load(stream)
with open("disambiguations.yaml", "r") as stream:
disambiguations = yaml.load(stream)
# first we need to check how many additional tags there are
# for that take the first element of the keys list of the
# dictionary
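    # e.g. a hypothetical key (("she", "PRP"), ("'s", "VBZ"), "VBN")
    # holds one plain POS-tag string after the (word, tag) tuples,
    # so it would yield add_tags == 1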
add_tags = 0
for element in list(disambiguations)[0]:
# if the type is str and not tuple it is an additional tag
if isinstance(element, str):
add_tags += 1
output = []
# look at all the sentences in the list
for word_pos_ner in utils.conv_2_word_pos(stanford_model,
sent_list,
is_split=is_split,
use_ner=use_ner,
ner_args=ner_args):
if use_ner:
# the actual sentence is just the first element, the second
# one is the list of strings that were replaced (i.e. the
# named-entities).
sent = word_pos_ner[0]
else:
sent = word_pos_ner
# get all the indices of the contractions
idx_lst = _extract_contractions(sent)
sent = _do_replacements(sent,
idx_lst,
add_tags,
contractions,
disambiguations)
output.append(sent)
        # at this point the next item has definitely been added to
        # output, so just replace the NER-tag now
if use_ner:
# just replace it in the last element
output[-1] = utils.ner_to_sent(output[-1],
word_pos_ner[1],
tag=ner_args[1])
if not is_split:
        # join the words back into sentences if they were supplied
        # as single strings
output = [' '.join(sent) for sent in output]
        # remove the spaces that joining introduced in front of
        # apostrophes and punctuation marks
        for mark in ("'", ";", ":", ".", ",", "!", "?"):
            output = [sent.replace(" " + mark, mark) for sent in output]
return output


if __name__ == '__main__':
TEST_CASES = [
"I won't let you get away with that", # won't -> will not
"I'm a bad person", # 'm -> am
"It's his cat anyway", # 's -> is
"It's not what you think", # 's -> is
"It's a man's world", # 's -> is and 's possessive
"Catherine's been thinking about it", # 's -> has
"It'll be done", # 'll -> will
"Who'd've thought!", # 'd -> would, 've -> have
"She said she'd go.", # she'd -> she would
"She said she'd gone.", # she'd -> had
"Y'all'd've a great time, wouldn't it be so cold!",
# Y'all'd've -> You all would have, wouldn't -> would not
" My name is Jack.", # No replacements.
"'Tis questionable whether Ma'am should be going.",
# 'Tis -> it is, Ma'am -> madam
"As history tells, 'twas the night before Christmas.",
# 'Twas -> It was
"Martha, Peter and Christine've been indulging in a menage-à-trois."
# 've -> have
]
    # load the Stanford POS and NER models
    POS_MODEL = utils.load_stanford(model='pos')
    NER_MODEL = utils.load_stanford(model='ner')
# expand the sentences
EXPANDED_LIST = expand_contractions(POS_MODEL,
TEST_CASES,
is_split=False,
use_ner=True,
ner_args=[NER_MODEL, "<NE>"])
for SENT in EXPANDED_LIST:
print(SENT)