import sys, re, csv, os, roman
from unidecode import unidecode

nemo_on = True  # often skip this for debugging because it is slow
from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = None
if nemo_on:
    normalizer = Normalizer(input_case='cased', lang='en')

# TODO: handle long parentheticals
# Notes and lists tend to appear at the end of an article and are not suitable for reading

def csv_to_dict(csv_file):
    csv_file_exp = os.path.expandvars(csv_file)
    with open(csv_file_exp, mode='r') as infile:
        reader = csv.reader(infile)
        mydict = {rows[0]: rows[1] for rows in reader}
    return mydict

replace_acronyms = csv_to_dict("$BASEDIR/wikipedia-tts/pronounced_acronyms.csv")
bulk_replace_dict = csv_to_dict("$BASEDIR/wikipedia-tts/bulk_replace.csv")  # long context-independent strings
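# Assumed CSV layout for both files (illustrative; the actual rows are not shown in
# this repo snapshot): two columns per row, no header. For example, a row like
# "NASA,nassa" in pronounced_acronyms.csv would map an acronym to its spoken form,
# and "e.g.,for example" in bulk_replace.csv would map a literal string to a replacement.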
def do_headings(text):
    lines = text.splitlines()
    heading_pattern = r'^==[^=]+==$'
    subheading_pattern = r'^==='
    heading_counter = 1
    output = ""
    for line in lines:
        if re.match(heading_pattern, line):
            output = output + "==" + "Main heading " + str(heading_counter) + ", " + line[2:] + " [period] .\n "
            heading_counter += 1
        elif re.match(subheading_pattern, line):
            index = 0  # could be three or four '='s depending on level
            while index < len(line) and line[index] == '=':
                index += 1
            output = output + line[:index] + "Subheading, " + line[index:] + "[period] .\n "
        else:
            output = output + line + "\n"
    output = re.sub("=[=]+", "", output)
    return output
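# Example (illustrative): the article line "== Early life ==" becomes
# "Main heading 1,  Early life  [period] ." once the final re.sub strips the runs
# of '='s; "[period]" is turned back into "." later, inside preprocess().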
def remove_boring_end(text):
    boring_headings = {"== Books ==", "== Honors and awards ==", "== Bibliography ==",
                       "== Speeches and works ==", "== Primary sources ==", "== External links ==",
                       "== References ==", "== Notes and References ==", "== See Also ==",
                       "== Honours ==", "== Honors ==", "== Gallery ==", "== See also ==",
                       "== Further reading ==", "== Works =="}
    positions = [len(text)]
    for ending in boring_headings:
        pos = text.find(ending)
        if pos != -1:
            positions.append(pos)  # if found then add it
    first_pos = min(positions)
    return text[:first_pos]
# makes "$1.2 billion" into "1.2 billion dollars", which TTS handles more gracefully
def money_replace(matchobj):
    moneystr = matchobj.group(0)[1:]  # string slice off the $
    return moneystr + " dollars "

# things that can be bulk replaced, i.e. don't require regex or other context
def bulk_replace(text):
    for key in bulk_replace_dict.keys():
        text = text.replace(key, bulk_replace_dict[key])
    return text
# This method from stack exchange https://stackoverflow.com/a/31505798/720763 CC-BY-SA-4.0
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr|Prof|Capt)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"

def gh_sentences(text):
    text = " " + text + " "
    # text = re.sub('==*', '.', text)  # wikipedia headers
    text = text.replace("\n", ".")
    text = text.replace("-", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
    if "..." in text: text = text.replace("...", "<prd><prd><prd>")
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace(": ", ":<stop>")
    text = text.replace(";", ";<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
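# Example (illustrative): gh_sentences("Mr. Smith bought shares. He sold them later.")
# returns ['Mr. Smith bought shares.', 'He sold them later.'] -- the "Mr." prefix is
# protected with <prd> so its period does not trigger a sentence break.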
# Expands rank markers like "#1" into ordinals like "1st", which TTS reads more reliably
def ordinal_replace(matchobj):
    n = matchobj.group(0)[1:]  # strip the leading '#'
    n = int(n)
    if 11 <= (n % 100) <= 13:
        suffix = 'th'
    else:
        suffix = ['th', 'st', 'nd', 'rd', 'th'][min(n % 10, 4)]
    return str(n) + suffix
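# Example (illustrative): re.sub(r'#[0-9][0-9]*', ordinal_replace, "ranked #3 overall")
# -> "ranked 3rd overall"; "#12" takes the 11-13 special case and becomes "12th".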
def acronym_split(matchobj):
    text = matchobj.group(0)
    if text in replace_acronyms: return replace_acronyms[text]
    return " ".join(text)  # the default is to split with spaces so each letter is spoken, see https://stackoverflow.com/a/18221460/
def get_middle_comma(sentence):
    if len(sentence) == 0: return 0
    comma_indices = [i for i, char in enumerate(sentence) if char == ',']
    space_indices = [i for i, char in enumerate(sentence) if char == ' ']
    middle_index = len(sentence) // 2
    nearest_comma_index = None
    # Find the nearest comma that is not in the middle of a number
    for index in comma_indices:
        if not (index == 0) and not (index > (len(sentence) - 2)):
            if not sentence[index - 1].isdigit() and not sentence[index + 1].isdigit():
                if nearest_comma_index is None or abs(index - middle_index) < abs(nearest_comma_index - middle_index):
                    nearest_comma_index = index
    # If no valid comma is found, find the nearest space
    if nearest_comma_index is None:
        if not space_indices: return middle_index  # no spaces either; fall back to the midpoint
        nearest_space_index = min(space_indices, key=lambda x: abs(x - middle_index))
        return nearest_space_index
    return nearest_comma_index
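# Example (illustrative): in "He came, he saw, he conquered" the midpoint is index 14,
# so the comma at index 15 (after "saw") is returned as the split point.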
def date_ranges(matchobj):
    text = matchobj.group(0)
    text = text.replace("(", "from ")
    text = text.replace(")", " ")
    text = text.replace("-", " to ")
    text = text.replace("—", " to ")
    return text

def number_ranges(matchobj):
    text = matchobj.group(0)
    text = text.replace("-", " to ")
    text = text.replace("—", " to ")
    return text
def normalize_local(text):
    if nemo_on:
        return normalizer.normalize(text, verbose=False, punct_post_process=True)
    else:
        return text
def spell_out_units(matchobj):
    text = matchobj.group(0)
    # order matters: longer unit strings must come before their prefixes (km2 before km)
    length_dict = {"km2": "square kilometers",
                   "mm2": "square millimeters",
                   "m2": "square meters",
                   "sq mi": "square miles",
                   "km": "kilometers",
                   "mm": "millimeters",
                   "cm": "centimeters",
                   "nm": "nanometers",
                   "ft": "feet",
                   "in)": "inches)",  # false positives without parens
                   "lbs": "pounds",
                   "lb": "pounds",
                   "kg": "kilograms",
                   "g": "grams",
                   "mi": "miles",
                   "m": "meters",
                   "oz": "ounces",
                   "yd": "yards",
                   "%": " percent"
                   }
    for key, value in length_dict.items():
        if key in text:
            return text.replace(key, value)  # don't do more than one replacement, to avoid things like "metersmiles"
    return text
def birth_death_dates(matchobj):
    text = matchobj.group(0)
    text = text.replace("b.", " who was born in ")
    text = text.replace("d.", " who died in ")
    text = text.replace(")", "")
    text = text.replace("(", "")
    return text
# since there will be a lot of these
def replace_decimal_points(matchobj):
    text = matchobj.group(0)
    text = re.sub(r'(\d)[.](\d)', r'\1 point \2', text)
    return text
# since there will be a lot of these
def fractions(matchobj):
    text = matchobj.group(0)
    text = text.replace("+", " and ")
    text = re.sub(r'1[/]2', r' one half ', text)
    text = re.sub(r'(\d)[/]4', r'\1 quarters ', text)
    text = re.sub(r'(\d)[/]8', r'\1 eighths ', text)
    text = re.sub(r'(\d)[/]16', r'\1 sixteenths ', text)
    text = re.sub(r'(\d)[/]32', r'\1 thirty seconds ', text)
    text = re.sub(r'(\d)[/]64', r'\1 sixty fourths ', text)
    return text
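# Example (illustrative): the calling pattern r'[+]?\d\d?/\d+' in preprocess() matches
# the "+3/4" inside "2+3/4", so "2+3/4" becomes "2 and 3 quarters ".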
def circas(matchobj):
    text = matchobj.group(0)
    text = text.replace("c.", " sirka ")  # phonetic spelling so the TTS pronounces "circa"
    return text
def clause_split(matchobj):
    text = matchobj.group(0)
    text = text.replace(",", ",<stop>")
    return text

def make_ordinal(n):
    n = int(n)
    if 11 <= (n % 100) <= 13:
        suffix = 'th'
    else:
        suffix = ['th', 'st', 'nd', 'rd', 'th'][min(n % 10, 4)]
    return str(n) + suffix
def monarch_replace(matchobj):
    text = matchobj.group(0)
    if "Malcolm X" in text: return text  # Brother Malcolm is not the tenth
    name = re.search(r'[A-Z][a-z]+ ', text).group(0)
    roman_n = re.search(r' [XIV]+[ ,.]', text).group(0).strip()
    roman_n = roman_n.replace(",", "")
    roman_n = roman_n.replace(".", "")
    if len(roman_n) == 1:
        print("Verify monarch:" + text, file=sys.stderr)  # low confidence with a single numeral, could be something else
    int_n = roman.fromRoman(roman_n)
    ord_n = make_ordinal(int_n)
    return name + "the " + ord_n + text[-1]
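# Example (illustrative): re.sub(r'[A-Z][a-z]+ [XIV]+[ ,.]', monarch_replace,
# "Henry VIII ruled England.") -> "Henry the 8th ruled England."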
# template for new match-object replacement callbacks (currently a no-op)
def template_method(matchobj):
    text = matchobj.group(0)
    return text
# Nemo text processing turns "830s" into "eight hundred and thirty S"; see its bug report #72
def decade_pluralfix(matchobj):
    text = matchobj.group(0)
    text = text.replace("ty S ", "ties ")
    text = text.replace("ten S ", "tens ")
    return text
def preprocess(text):
    text = remove_boring_end(text)
    text = re.sub(r'{.*?\}', '', text)  # curly brace formatting
    text = do_headings(text)
    text = text.replace("\u2212", "minus ")
    text = text.replace("\u2014", "-")
    text = text.replace("{", "")
    text = text.replace("}", "")
    text = text.replace("–", "-")
    text = text.replace("  ", " ")  # collapse doubled spaces
    text = bulk_replace(text)
    text = re.sub(r': ?[0-9]+-[0-9]+', '', text)  # Wikipedia pincites
    text = re.sub(r':[ ]?[0-9]+', '', text)  # Wikipedia pincites
    text = unidecode(text)
    text = text.replace("degF", " degrees fahrenheit ")
    text = text.replace("degC", " degrees celsius ")
    text = text.replace("--", "-")
    text = text.replace("[[", "")
    text = text.replace("]]", "")
    text = re.sub(r',[ ]?(from|as well as|during|so|and|or|because|such|including|but|that|whereas|where|when|with|whether|for example|rather|although|who|in which|which)', clause_split, text)
    text = re.sub(r'[( -]c[.] ?[0-9]+', circas, text)
    text = re.sub(r'\([bd][.] ?[0-9]+\)', birth_death_dates, text)
    text = re.sub(r'#[0-9][0-9]*', ordinal_replace, text)
    text = re.sub(r'\$[0-9.]* ?[bmtz]illion', money_replace, text)
    text = re.sub(r'[A-Z][a-z]+ [XIV]+[ ,.]', monarch_replace, text)
    text = re.sub(r'\([0-9][0-9]?[0-9]?[0-9]?[ADBC ]*[-][0-9][0-9]?[0-9]?[0-9]?[ADBC ]*\)', date_ranges, text)
    text = re.sub(r'(\d)[-](\d)', r'\1 to \2', text)  # number ranges; may make number_ranges below redundant
    text = re.sub(r'[,0-9]+[-][0-9,]+', number_ranges, text)
    text = re.sub(r'\(Coordinates:[^)]+\)', '', text)
    text = re.sub(r'\d+\s*\d*\s*(m|m2|km|km2|mm2|ft|in|lb|lbs|g|kg|ha|sq mi|cm|mm|nm|yd|oz|%)\W', spell_out_units, text)
    text = re.sub(r'\d+\s*to \d+\s*(m|m2|km|km2|mm2|ft|in|lb|lbs|g|kg|ha|sq mi|cm|mm|nm|yd|oz|%)\W', spell_out_units, text)
    text = text.replace("kilometers2", "square kilometers")  # not sure why the above doesn't catch this; kluge
    text = text.replace("mm2", "square millimeters")  # seems safe, pretty unique string
    text = re.sub(r'[+]?\d\d?/\d+', fractions, text)
    text = re.sub(r'[,0-9.]+', replace_decimal_points, text)
    sentences_in = gh_sentences(text)
    sentences_out = []
    for sen in sentences_in:
        sen = re.sub(r'[A-Z][A-Z]*', acronym_split, sen)
        sen = sen[:-3].replace(".", " ") + sen[-3:]  # remaining dots are degrees, initials, etc. at this point; leave terminating periods
        # sen = sen.replace(":", " ")  # colons and semicolons were already used to do sentence breaks
        # sen = sen.replace(";", " ")
        sen = sen.replace("/", " ")
        sen = sen.replace("+", " ")
        sen = sen.replace("*", " ")
        sen = sen.replace("(", ",")  # tortoise seems to pause more for commas than parens
        sen = sen.replace(")", ",")
        sen = sen.replace(",,", ",")
        sen = sen.replace(" ,", ", ")
        sen = sen.replace("[period]", ".")
        sen = sen.replace("-", ",")
        sen = sen.replace(" . .", ".")  # mystified where these come from
        sen = sen.replace(" ...", ".")
        sen = sen.replace("  ", " ")
        # sen = sen.replace(" .|", ".|")
        sen = sen.strip()
        if len(sen) < 3: continue  # these are stubs, often just a period
        if len(sen) > 270:  # soft max on sentence length (longer may survive if a sentence couldn't be broken up)
            middleish_comma = get_middle_comma(sen)
            sentences_out.append(normalize_local(sen[:middleish_comma]))
            sentences_out.append(normalize_local(sen[middleish_comma + 1:]))
        else:
            sentences_out.append(normalize_local(sen))
    # re-pack the normalized sentences into chunks of up to 270 characters
    sentences_in = sentences_out
    sentences_out = []
    senbuffer = ""
    while len(sentences_in) > 0:
        sen = sentences_in.pop(0)
        if len(senbuffer) + len(sen) > 270:
            senbuffer = re.sub("hundred and (ten|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety) [Ss] ", decade_pluralfix, senbuffer)
            senbuffer = senbuffer.replace("thousand and ten S", "thousand tens")
            sentences_out.append(senbuffer)
            senbuffer = sen
        else:
            senbuffer = sen if not senbuffer else senbuffer + " " + sen  # keep a space between joined sentences
    if len(senbuffer) > 0:
        sentences_out.append(senbuffer)
    return sentences_out
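# A minimal driver sketch (an addition, not part of the original module): assumes the
# article text arrives on stdin and prints one TTS-ready chunk per line. How the real
# pipeline invokes preprocess() is not shown in this file, so treat this as illustrative.
if __name__ == "__main__":
    article = sys.stdin.read()
    for chunk in preprocess(article):
        print(chunk)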