-
Notifications
You must be signed in to change notification settings - Fork 2
/
match.py
378 lines (327 loc) · 15.5 KB
/
match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2006-2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
"""Class to perform translation memory matching from a store of translation units"""
import heapq
import re
from translate.search import lshtein
from translate.search import terminology
from translate.storage import base
from translate.storage import po
from translate.misc.multistring import multistring
def sourcelen(unit):
    """Returns the length of the source string"""
    # Used as a sort key so the candidate list is ordered by source length.
    text = unit.source
    return len(text)
def _sort_matches(matches, match_info):
def _matches_cmp(x, y):
# This function will sort a list of matches according to the match's starting
# position, putting the one with the longer source text first, if two are the same.
c = cmp(match_info[x.source]['pos'], match_info[y.source]['pos'])
return c and c or cmp(len(y.source), len(x.source))
matches.sort(_matches_cmp)
class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    # Sort direction of the candidate list: False keeps shortest sources
    # first, which matches() relies on for its binary search and early break.
    # Overridden to True by terminologymatcher.
    sort_reverse = False

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with similarity() function"""
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        # When True, fuzzy units are also accepted into the TM.
        self.usefuzzy = usefuzzy
        self.inittm(store)
        # When True, matches() appends the match quality (e.g. "75%") as a
        # note on each returned unit.
        self.addpercentage = True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            # Single-character sources are useless as TM candidates.
            if len(source) < 2:
                return False
            # Reject exact (source, target) duplicates; a different target
            # for an already-seen source is accepted and remembered.
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores, reverse=False):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        # reverse is deprecated - just use self.sort_reverse
        # Maps source -> target of already-accepted units (duplicate filter).
        self.existingunits = {}
        self.candidates = base.TranslationStore()
        if isinstance(stores, base.TranslationStore):
            stores = [stores]
        for store in stores:
            # sort=False: sort once after all stores are added, not per store.
            self.extendtm(store.units, store=store, sort=False)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        # (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).
        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if isinstance(units, base.TranslationUnit):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    # Keep the full plural forms so buildunits() can restore
                    # them on the returned unit.
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                # NOTE: unicode() is Python 2 only; this module targets py2.
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new po file
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)

    def matches(self, text):
        """Returns a list of possible matches for given source text.
        @type text: String
        @param text: The text that will be search for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        # Min-heap of (score, unit) pairs; bestcandidates[0] is always the
        # worst of the current best MAX_CANDIDATES.
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY
        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.
        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid
        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0
        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            # Candidates are sorted by source length, so the first too-long
            # source ends the scan.
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                # Evict the current worst best-candidate in favour of this one.
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                if lowestscore >= 100:
                    # Perfect match found; nothing can beat it.
                    break
                if min_similarity < lowestscore:
                    # Tighten the acceptance threshold, which also shrinks
                    # the maximum source length worth considering.
                    min_similarity = lowestscore
                    stoplength = self.getstoplength(min_similarity, text)
        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        # NOTE(review): filter() returns a list only on Python 2; under
        # Python 3 the subsequent .sort() would fail. This module is py2 code.
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort(reverse=True)
        return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                # Restore the original multistring (plural) forms saved
                # in extendtm().
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                # Record the match quality as a note, e.g. "75%".
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units
# We don't want to miss certain forms of words that only change a little
# at the end. Now we are tying this code to English, but it should serve
# us well. For example "category" should be found in "categories",
# "copy" should be found in "copied"
#
# The tuples define a regular expression to search for, and with what it
# should be replaced.
# Raw strings are used for the regex patterns: "\s" in a plain string is an
# invalid escape sequence (a warning today, a syntax error in future
# Pythons); the raw form has exactly the same value.
ignorepatterns = [
    (r"y\s*$", "ie"),  #category/categories, identify/identifies, apply/applied
    (r"[\s-]+", ""),   #down time / downtime, pre-order / preorder
    ("-", " "),        #pre-order / pre order
    (" ", "-"),        #pre order / pre-order
]

# Precompile the patterns once so terminologymatcher.inittm() doesn't pay
# for compilation on every unit.
ignorepatterns_re = [(re.compile(a), b) for (a, b) in ignorepatterns]

# Strips a trailing parenthesised context/disambiguation marker,
# e.g. "file (noun)" -> "file".
context_re = re.compile(r"\s+\(.*\)\s*$")
class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    # Longest sources first, so longer terms are considered before the
    # shorter terms they may contain.
    sort_reverse = True

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        # min_similarity is deliberately forced to 10: terminology matching
        # is occurrence based rather than edit-distance based, so the
        # caller-supplied min_similarity is ignored.
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        self.addpercentage = False
        # Maps matched source text -> {'pos': start offset in the last
        # searched text}; populated by matches().
        self.match_info = {}

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        extras = []
        for unit in self.candidates.units:
            # Strip trailing "(context)" markers and lowercase for matching.
            source = unit.source = context_re.sub("", unit.source).lower()
            for ignorepattern_re, replacement in ignorepatterns_re:
                # Generate an altered form (e.g. "category" -> "categorie")
                # as an extra candidate when the pattern applies.
                (newterm, occurrences) = ignorepattern_re.subn(replacement, source)
                if occurrences:
                    new_unit = type(unit).buildfromunit(unit)
                    new_unit.source = newterm
                    # We mark it fuzzy to indicate that it isn't pristine
                    # NOTE(review): this marks the ORIGINAL unit fuzzy, not
                    # the derived new_unit — confirm that is the intent.
                    unit.markfuzzy()
                    extras.append(new_unit)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        if extras:
            # We don't sort, so that the altered forms are at the back and
            # considered last.
            self.extendtm(extras, sort=False)

    def getstartlength(self, min_similarity, text):
        # Reduce false matches by not working with terms of two
        # characters or less (both parameters are unused here).
        return 3

    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 50 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 50

    def usable(self, unit):
        """Returns whether this translation unit is usable for terminology."""
        if not unit.istranslated():
            return False
        # Length of the source with any trailing "(context)" marker removed.
        l = len(context_re.sub("", unit.source))
        return l <= self.MAX_LENGTH and l >= self.getstartlength(None, None)

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text_l = len(text)
        if text_l < self.getstartlength(0, ''): # parameters unused
            # impossible to return anything
            return []
        text = text.lower()
        comparer = self.comparer
        # The comparer records where each term occurred in text.
        comparer.match_info = {}
        match_info = {}
        matches = []
        # (source, target) pairs already matched, to skip duplicates.
        known = set()
        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too long. We use binary search
        # to find the first string short enough to occur in text, from where we
        # start our search in the candidates.
        # the maximum possible length is text_l
        startindex = 0
        endindex = len(self.candidates.units)
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) > text_l:
                startindex = mid + 1
            else:
                endindex = mid
        for cand in self.candidates.units[startindex:]:
            source = cand.source
            if (source, cand.target) in known:
                continue
            if comparer.similarity(text, source, self.MIN_SIMILARITY):
                # Remember where in text this term starts.
                match_info[source] = {'pos': comparer.match_info[source]['pos']}
                matches.append(cand)
                known.add((source, cand.target))
        # Second pass: keep only non-overlapping matches (earliest, then
        # longest, wins), but keep alternative translations of the same span.
        final_matches = []
        lastend = 0
        _sort_matches(matches, match_info)
        for match in matches:
            start_pos = match_info[match.source]['pos']
            if start_pos < lastend:
                # Overlaps a term already accepted; skip it.
                continue
            end = start_pos + len(match.source)
            final_matches.append(match)
            # Get translations for the placeable
            for m in matches:
                if m is match:
                    continue
                m_info = match_info[m.source]
                m_end = m_info['pos']
                if m_end > start_pos:
                    # we are past valid possibilities in the (sorted) list
                    break
                m_end += len(m.source)
                if start_pos == m_info['pos'] and end == m_end:
                    # another match for the same term
                    final_matches.append(m)
            lastend = end
        if final_matches:
            self.match_info = match_info
        return final_matches
# utility functions used by virtaal and tmserver to convert matching units in easily marshallable dictionaries
def unit2dict(unit):
    """converts a pounit to a simple dict structure for use over the web"""
    # Flatten the unit into plain key/value pairs; the quality note
    # (e.g. "75%") is parsed out of the unit's comments.
    result = {}
    result["source"] = unit.source
    result["target"] = unit.target
    result["quality"] = _parse_quality(unit.getnotes())
    result["context"] = unit.getcontext()
    return result
def _parse_quality(comment):
"""extracts match quality from po comments"""
quality = re.search('([0-9]+)%', comment)
if quality:
return quality.group(1)