This repository has been archived by the owner on May 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
matcher.py
317 lines (230 loc) · 9.6 KB
/
matcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
'''The implementation of the sigil recognition algorithm. match_sigils() is the
main interface.'''
import math
import re
from boxlookup import BoxLookup
from collections import defaultdict, Counter
import sigil
class Match(object):
def __init__(self, sig, start):
self.sig = sig
self.start = start
self.end = start + len(sig.ops)
# adjacency lists and validity status for alignment check
self.next_matches = []
self.prev_matches = []
self.passes_alignment_check = False
def get_series(self):
'''During the alignment check, find all matches which are aligned
with this one, ie all the characters following this one in the current
word.'''
current = self
result = [self]
while len(current.next_matches) > 0:
current = current.next_match()
result.append(current)
return result
def next_match(self):
'''Return the next aligned match, ie the one with the smallest
x or y coordinate depending on angle.'''
if self.sig.angle == 0:
key = lambda m: m.origin[0]
elif self.sig.angle == -90:
key = lambda m: m.origin[1]
return min(self.next_matches, key=key)
def __iter__(self):
'''Hack for compatibility with old tuples.'''
if hasattr(self, 'origin'):
return iter( (self.sig, self.origin, self.sf) )
else:
return iter( (self.sig, self.start, self.end) )
def match_sigils(sigdict, abs_ops, skip_alignment_check=False):
if len(abs_ops) < 2:
return []
non_zero_abs_ops = sigil.remove_zero_ops(abs_ops)
ops = sigil.diff_ops(non_zero_abs_ops)
matches = match_without_scale(sigdict, ops)
matches = check_scales(matches, non_zero_abs_ops, ops)
matches = remove_submatches(matches)
if not skip_alignment_check:
matches = check_alignment(sigdict, matches)
print count_ambiguous(matches)
return matches
def match_without_scale(sigdict, ops):
'''Find all possible matches based only on the directions of each line.
Returns an array matches of the form:
[Match(sigil, start)]
where start is the index of the first operation.'''
matches = []
# wrap in ms to make it easier for the regex to match the divisions between
# continuous lines
all_opcodes = 'm{}m'.format(''.join(opcode for (_,opcode) in ops))
# combine sigils with rotated copies
all_sigils = sum(sigdict.values(), [])
all_sigils = all_sigils + [sig.rotated(-90) for sig in all_sigils]
for sig in all_sigils:
# first find all places where the correct sequence of operation types is present
sig_regex = '(?=m{}m)'.format(''.join(opcode for (_,opcode) in sig.ops))
possible_starts = [m.start() for m in
re.finditer(sig_regex, all_opcodes)]
# now at each position, check the directions of the operations are correct
for start in possible_starts:
doc_ops = ops[start : start+len(sig.ops)]
if all(match_op(doc_op, sig_op) for (doc_op, sig_op) in zip(doc_ops, sig.ops)):
matches.append( Match(sig, start) )
return matches
def match_op(step1, step2, tol=0.93):
'''Compare two operations and return True if they are in the same direction.'''
((x1, y1), c1) = step1
((x2, y2), c2) = step2
n1 = math.sqrt(x1*x1+y1*y1)
n2 = math.sqrt(x2*x2+y2*y2)
if n1 < 0.01 or n2 < 0.01:
return n1 < 0.01 and n2 < 0.01
if c1 == 'c':
tol = 0.7
return (x1*x2+y1*y2)/n1/n2 > tol
def remove_submatches(matches):
'''Given a list of matches, removing all submatches.
A submatch is a match whose operations are all part of a larger match, eg
matching a hyphen on the crossbar of an A.
Additionally, if the same letter is matched twice at the same point, the
earlier match is removed.'''
if len(matches) == 0:
return []
def end_then_start(match):
return (match.end, -match.start)
deleted = []
matches = sorted(matches, key=end_then_start)
supermatch = matches[-1]
for (i, m) in reversed(list(enumerate(matches[:-1]))):
if m.start > supermatch.start or \
(m.start == supermatch.start and m.end < supermatch.end) or \
(m.start == supermatch.start and m.end == supermatch.end and
m.sig.char == supermatch.sig.char):
deleted.append((m.sig.char, supermatch.sig.char))
del matches[i]
if m.start < supermatch.start:
supermatch = m
return matches
def check_scales(matches, abs_ops, ops):
'''Given a list of matches, and the ops that they come from, calculate the
scale factor of each match.
Matches with inconsistent scale factors are removed.
Returns a list of matches with the sf and origin members added.'''
processed_matches = []
for m in matches:
doc_ops = ops[m.start:m.end]
doc_sf = sigil.ops_scale(doc_ops) / m.sig.scale
# check scale factor of each operation
scale_error = False
for sig_op, doc_op in zip(m.sig.ops, doc_ops):
assert sig_op[1] == doc_op[1], "match_op() isn't doing its job"
sig_n = math.sqrt(sig_op[0][0]**2 + sig_op[0][1]**2)
doc_n = math.sqrt(doc_op[0][0]**2 + doc_op[0][1]**2)
# set position tolerance based on operation type
if sig_op[1] == 'c':
tol = max(4 * doc_sf, 0.3)
else:
tol = 0.3
if sig_n < 0.01 or doc_n < 0.01:
assert sig_n < 0.01 and doc_n < 0.01, "match_op() isn't doing its job"
else:
len_error = abs(doc_n - sig_n * doc_sf)
if len_error > tol:
scale_error = True
# get absolute position of sig origin
start_op = abs_ops[m.start]
origin = [a+b*doc_sf for a,b in zip(start_op[0], m.sig.origin)]
if scale_error is False:
m.origin = origin
m.sf = doc_sf
processed_matches.append(m)
return processed_matches
def check_alignment(sigdict, matches):
'''Filter out single-operation matches that aren't aligned with characters
just beforehand (ie, no underscores, slashes or hyphens at the start of a
word).
To be accepted, a single-operation match must be on the same y level as an
accepted match (ie on the same line), and must be the right x distance away
to be the next character.
The main advantage to this step is that it disambiguates between hyphens
and underscores.'''
# we need V to measure the font
if 'V' not in sigdict:
return matches
# estimate font metrics based on V, the widest character
v_width = sigdict['V'][0].width
gap_width = v_width / 2.58 # gap between adjacent characters
space_width = gap_width * 2 # the width of a space character
epsilon = 0.001 # minimum gap between characters
# hardcoded y alignment tolerance
max_y_sep = 0.7
matches_bl = BoxLookup(matches)
# for each match, work out the range of possible starts of the next character:
# FROM x + width,
# y - max_y_sep
# TO x + width + gap + space + gap + one more gap for tolerance,
# y + max_y_sep
# or the corresponding rotated coordinates for a rotated character
#
# then, check all matches in this box for alignment and record them
for i, m in enumerate(matches):
x, y = m.origin
assert m.sig.angle in [0, -90]
if m.sig.angle == 0:
box = [
x + m.sf * m.sig.width + epsilon,
y - max_y_sep,
x + m.sf * (m.sig.width + 3 * gap_width + space_width),
y + max_y_sep,
]
elif m.sig.angle == -90:
box = [
x - max_y_sep,
y + m.sf * m.sig.width + epsilon,
x + max_y_sep,
y + m.sf * (m.sig.width + 3 * gap_width + space_width),
]
for m2 in matches_bl.search(*box):
# remove matches at different angles or scales
if m2.sig.angle != m.sig.angle:
continue
if m2.sf/m.sf < 0.9 or 1.1 < m2.sf/m.sf:
continue
# add edge to graph
m2.prev_matches.append(m)
m.next_matches.append(m2)
# identify whether matches are in valid serieses
for m in matches:
if m.prev_matches != []:
# not start of series
continue
series = m.get_series()
if series_is_valid(series):
for m2 in series:
m2.passes_alignment_check = True
# finally, exclude invalid matches
matches = [m for m in
matches if m.passes_alignment_check]
return matches
def series_is_valid(series):
'''A series of matches is valid if at least one has more than two
operations, and the series doesn't suffer case ambiguity (P, V, W, X, Z)'''
if all(m.sig.char.upper() in 'PVWXZ' for m in series):
return False
def match_is_valid(m):
return len(m.sig.ops) > 2
return any(match_is_valid(m) for m in series)
def count_ambiguous(matches):
'''Return a Counter() of sigils matching the same operations in the
document.'''
sigils_by_position = defaultdict(list)
for m in matches:
sigils_by_position[(m.start, len(m.sig.ops))].append(m)
ctr = Counter()
for ambiguous in sigils_by_position.values():
if len(ambiguous) > 1:
chars = tuple(sorted(m.sig.char for m in ambiguous))
ctr[chars] += 1
return ctr