-
Notifications
You must be signed in to change notification settings - Fork 0
/
wilddiac_utils.py
226 lines (165 loc) · 6.63 KB
/
wilddiac_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
from collections import namedtuple
import re
from typing import Iterable, List
import rapidfuzz.distance.Levenshtein as editdistance
from camel_tools.utils.charmap import CharMapper
from camel_tools.utils.charsets import UNICODE_PUNCT_CHARSET
from camel_tools.utils.charsets import AR_LETTERS_CHARSET, AR_DIAC_CHARSET
from camel_tools.utils.charsets import AR_CHARSET
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.disambig.common import ScoredAnalysis
# Public API: names exported by ``from wilddiac_utils import *``.
__all__ = (
'remove_contextual_diac_flags',
'fix_contextual_diacs',
'fix_diac_order',
'normalize_diac',
'OpCount',
'word_to_word_opcount',
'normalize_consonants',
'word_is_oov',
)
# Any Alef Wasla (U+0671), anywhere in the word.
ALL_WASLA_RE = re.compile('\u0671')
# Alef Wasla at the very start of a word.
STARTING_WASLA_RE = re.compile(u'^\u0671')
# Word-initial Alef Wasla optionally followed by one of Fatha/Damma/Kasra/
# Sukun; used to strip that diacritic when the word is mid-context.
MIDDLE_WASLA_WITH_DIAC_RE = re.compile(u'^\u0671[\u064e\u064f\u0650\u0652]?')
# Sukun (U+0652) at the end of a word.
ENDING_SUKUN_RE = re.compile(u'\u0652$')
# Contextual flags produced by the extended analyzer: %m or %n.
CONTEXTUAL_DIACS_FLAG_RE = re.compile('%[mn]')
# %m: third/second masculine plural ending in Meem (see fix_contextual_diacs).
MEEM_FLAG_RE = re.compile('%m')
# %n: the word "min" (see fix_contextual_diacs).
MIN_FLAG_RE = re.compile('%n')
# An Arabic letter immediately followed by an Arabic diacritic at word start.
DIAC_AR_AT_START_RE = re.compile('^[' + ''.join(AR_LETTERS_CHARSET) + '][' +
''.join(AR_DIAC_CHARSET) + ']')
# Entirely Arabic characters, with an optional trailing %m/%n flag.
IS_AR_RE = re.compile('^[' + re.escape(''.join(AR_CHARSET)) + ']+(%[mn])?$')
# Tanween Fatha (U+064B), with optional Shadda on either side, occurring
# before an Alef (U+0627) or Alef Maksoura (U+0649).
FIX_TANWEEN_RE_01 = re.compile('(\u0651?)\u064b(\u0651?)(\u0627|\u0649)')
# Alef or Alef Maksoura followed by a Tanween Fatha.
FIX_TANWEEN_RE_02 = re.compile('(\u0627|\u0649)\u064b')
# A diacritic cluster: optional short vowel/tanween (U+064B-U+0650), Shadda,
# Sukun, dagger Alif — captured separately so they can be reordered.
FIX_DIAC_ORDER_RE = re.compile('([\u064b-\u0650]?)(\u0651?)(\u0652?)(\u0670?)')
def is_ar_word(word: str) -> bool:
    """Check whether a word consists entirely of Arabic characters
    (optionally carrying a trailing %m/%n contextual flag).

    Returns True on a full match, False otherwise.
    """
    return bool(IS_AR_RE.match(word))
def is_punct(token: str) -> bool:
    """Check whether every character of a token is a punctuation/symbolic
    character.

    Returns True when all characters qualify (trivially True for an empty
    token), False otherwise.
    """
    return all(ch in UNICODE_PUNCT_CHARSET for ch in token)
def remove_contextual_diac_flags(word: str) -> str:
    """Strip the %m/%n contextual flags generated by our extended
    Morphological Analyzer and map every Alef Wasla to a plain Alef.

    Non-Arabic words are returned unchanged.
    """
    if not is_ar_word(word):
        return word
    stripped = CONTEXTUAL_DIACS_FLAG_RE.sub('', word)
    return ALL_WASLA_RE.sub('\u0627', stripped)
def fix_contextual_diacs(diacs: List[str]) -> List[str]:
    """Perform contextual diacritization fixes on a given sentence.

    ``diacs`` is the sentence as a list of diacritized words, possibly
    carrying %m/%n flags from the extended analyzer. The list is mutated
    in place and also returned. Fixes applied per word: strip the
    diacritic from a non-context-initial Alif Wasla, resolve a
    word-final Sukun before a following Alif Wasla (Damma for %m, the
    following word's diacritic for %n, Kasra by default), and convert
    word-initial Alif Wasla to Alif.
    """
    n = len(diacs)
    for i in range(n):
        # If a word is not an Arabic word do nothing.
        if not is_ar_word(diacs[i]):
            continue
        # Determine if a word is the start of a context: first word of
        # the sentence, or preceded by punctuation.
        is_context_beginning = True
        if i != 0 and not is_punct(diacs[i - 1]):
            is_context_beginning = False
        # If a word starts with a diacritized Alif Wasla in the middle of a
        # context, remove the diacritic.
        if not is_context_beginning:
            diacs[i] = MIDDLE_WASLA_WITH_DIAC_RE.sub(u'\u0671', diacs[i])
        # Determine if the next word begins with an Alif Wasla.
        # NOTE(review): diacs[i + 1][0] raises IndexError if the next
        # entry is an empty string — presumably words are never empty;
        # confirm against callers.
        is_followed_by_wasla = False
        if i < (n - 1) and diacs[i + 1][0] == u'\u0671':
            is_followed_by_wasla = True
        # If the current word ends with a Sukun and is followed by an
        # Alif Wasla, determine the appropriate replacement for the Sukun.
        # Use Kasra by default.
        replacement = u'\u0650'  # Kasra
        # If the word contains a marker indicating third masculine plural
        # or second masculine plural ending with a Meem, use a Damma.
        # (The flag is stripped even when no Sukun replacement happens.)
        if MEEM_FLAG_RE.search(diacs[i]) is not None:
            diacs[i] = MEEM_FLAG_RE.sub('', diacs[i])
            replacement = u'\u064f'
        # If the word is "min" then we use the diacritic on the following
        # Alif Wasla.
        elif MIN_FLAG_RE.search(diacs[i]) is not None:
            diacs[i] = MIN_FLAG_RE.sub('', diacs[i])
            if (i < (n - 1) and
                    DIAC_AR_AT_START_RE.match(diacs[i+1]) is not None):
                replacement = diacs[i + 1][1]  # Take wasla diac
        # If the current word ends with a Sukun and is followed by an
        # Alif Wasla, replace Sukun with our chosen replacement.
        if is_followed_by_wasla:
            if diacs[i][-1] == u'\u0652':
                diacs[i] = ENDING_SUKUN_RE.sub(replacement, diacs[i])
        # Convert all starting Alif Waslas to an Alif.
        diacs[i] = STARTING_WASLA_RE.sub(u'\u0627', diacs[i])
    return diacs
def fix_tanween_alef(word: str) -> str:
    """Move a Tanween Fatha that precedes an Alef (or Alef Maksoura) so
    that it follows the letter instead, preserving any adjacent Shadda.
    """
    def _reorder(match):
        # Shadda pieces and the Alef keep their order; Tanween goes last.
        return (match.group(1) + match.group(2) + match.group(3) +
                u'\u064b')
    return FIX_TANWEEN_RE_01.sub(_reorder, word)
def fix_tanween_alef_before(word: str) -> str:
    """Move a Tanween Fatha following an Alef (or Alef Maksoura) to come
    before it.

    Inverse reordering of ``fix_tanween_alef``. The return annotation was
    added for consistency with the other word-level helpers.
    """
    # \g<1> is the Alef/Alef Maksoura; the Tanween (U+064B) is emitted first.
    return FIX_TANWEEN_RE_02.sub('\u064b\\g<1>', word)
def fix_diac_order(word: str) -> str:
    """Reorder the diacritics of a word into a more natural order
    (Shadda before the short vowel/tanween, then Sukun, then dagger Alif).
    """
    normalized = normalize_unicode(word)
    shifted = fix_tanween_alef_before(normalized)
    # Swap groups 1 and 2 of each diacritic cluster: Shadda first.
    return FIX_DIAC_ORDER_RE.sub('\\g<2>\\g<1>\\g<3>\\g<4>', shifted)
def normalize_diac(word: str) -> str:
    """Normalize the diacritics of a given word to a form suitable for
    comparison.

    Applies Unicode normalization, moves any pre-Alef Tanween Fatha to
    follow the Alef, then normalizes again. The parameter annotation was
    added for consistency with the other word-level helpers.
    """
    word_norm = normalize_unicode(word)
    word_norm = fix_tanween_alef(word_norm)
    # NOTE(review): the second normalization pass mirrors the original
    # code; presumably it canonicalizes sequences produced by the
    # Tanween move — confirm it is required.
    word_norm = normalize_unicode(word_norm)
    return word_norm
OpCount = namedtuple('OpCount', ['noop', 'insert', 'substitute', 'delete'])
def word_to_word_opcount(src: str, trg: str) -> OpCount:
    """Count the Levenshtein edit operations (no-ops, insertions,
    substitutions, deletions) between a source and a target word.

    Each opcode run reported by the aligner counts once, whatever its
    length.
    """
    tally = {'equal': 0, 'insert': 0, 'replace': 0, 'delete': 0}
    for opcode in editdistance.opcodes(src, trg):
        if opcode.tag in tally:
            tally[opcode.tag] += 1
    return OpCount(tally['equal'], tally['insert'],
                   tally['replace'], tally['delete'])
"""Normalizes all Alif, Yaa/Alif Maksoura, and Haa/Taa Marbouta forms.
"""
normalize_consonants = CharMapper({
u'\u0625': u'\u0627',
u'\u0623': u'\u0627',
u'\u0622': u'\u0627',
u'\u0671': u'\u0627',
u'\u0649': u'\u064a',
u'\u0629': u'\u0647',
u'\u0640': u''
})
def word_is_oov(analyses: Iterable[ScoredAnalysis]) -> bool:
    """Returns True if either there are no analyses or all analyses are
    backoff analyses, returns False otherwise.

    Fixes:
    - An empty ``analyses`` now yields True as documented (the previous
      implementation returned False, contradicting its own docstring —
      a word with no analyses at all is out of vocabulary).
    - No ``len()`` call, so any ``Iterable`` (including generators) is
      accepted, matching the declared parameter type.
    """
    for analysis in analyses:
        # A single analysis from a real (non-backoff) source means the
        # word is known.
        if analysis.analysis['source'] != 'backoff':
            return False
    return True