-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrix.go
213 lines (186 loc) · 6.55 KB
/
matrix.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
// Levenshtein implements the Levenshtein (edit distance and slice diff) algorithm for golang
package levenshtein
// Number is the magnitude of the Levenshtein distance constraint
// (the type such as integer which can be used to express the difference between characters or the two compared slices elements)
type Number interface {
int8 | uint8 | int16 | uint16 | int32 | uint32 | int64 | uint64 | int | uint | float32 | float64
}
// One returns the nonzero Levenshtein distance with magnitude of 1
func One[T Number](uint) *T {
var n T
n = 1
return &n
}
// Distance returns the edit distance from the edit matrix (the last element),
// or nil if the matrix is of size 0. Used to figure out the Edit distance
// (how many minimum edits needed) from the matrix solved by Matrix.
func Distance[T any](d []T) *T {
if len(d) == 0 {
return nil
}
return &d[len(d)-1]
}
// OneSlice instantiates the default substitution cost callback which calculates
// the substitution cost between two comparable slices.
// The substitution cost is none if the slice elements are the same, and One if
// the slice elements are not equal.
// You can implement something like this to customize the substitution cost used by the slices algorithm.
func OneSlice[C comparable, T Number](a, b []C) func(uint, uint) *T {
return func(x, y uint) *T {
if a[x] == b[y] {
return (*T)(nil)
}
var n T
n = 1
return &n
}
}
// OneElements is the default substitution cost callback which calculates
// the substitution cost between two comparable elements.
// The substitution cost is none if the slice elements are the same, and One if
// the slice elements are not equal.
// You can implement something like this to customize the substitution cost used by the elements of slices algorithm.
func OneElements[C comparable, T Number](a *C, b *C) *T {
if *a == *b {
return (*T)(nil)
}
var n T
n = 1
return &n
}
// OneString instantiates the default substitution cost callback which calculates
// the substitution cost between two strings.
// The substitution cost is none if the string bytes are the same, and One if
// the string bytes are not equal.
// You can implement something like this to customize the substitution cost used by the strings algorithm.
func OneString[T Number](a, b string) func(uint, uint) *T {
return func(x, y uint) *T {
if a[x] == b[y] {
return (*T)(nil)
}
var n T
n = 1
return &n
}
}
// Kernel is the default Levenshtein algorithm kernel. You can customize the kernel to modify any costs or the algorithm calculation itself.
func Kernel[T Number](d []T, i uint, j uint, n uint, cost *T, delCost *T, insCost *T) {
// Calculate deletion, insertion, and substitution costs
var del = d[n*(i-1)+j] + *delCost
var ins = d[n*i+(j-1)] + *insCost
var sub = d[n*(i-1)+(j-1)] + *cost
// Pointers to current cell and backtracking info
var cell = &d[n*i+j]
// Update cost in the current cell
if del < ins {
if del < sub {
*cell = del
} else {
*cell = sub
}
} else {
if ins < sub {
*cell = ins
} else {
*cell = sub
}
}
}
// MatrixT is the transposed Levenshtein algorithm. Like Matrix except the the source sequence and the target sequence
// are exchanged. MatrixT returns a transposed matrix, while Matrix returns the original matrix.
func MatrixT[T Number](m uint, n uint, deletion func(uint) *T, insertion func(uint) *T,
substCost func(uint, uint) *T, kernel func(d []T, i uint, j uint, n uint, cost *T, delCost *T, insCost *T)) []T {
return Matrix(n, m, insertion, deletion, func(m uint, n uint) *T {
return substCost(n, m)
}, kernel)
}
// Matrix is the default Levenshtein algorithm. M is the size of the first sequence, N is the size of the second sequence.
// If the deletion, insertion, or substCost callbacks are not provided (i.e., passed as nil),
// the function uses default behaviors: insertions and deletions are treated as having a cost of 1,
// and substitutions are based on whether the characters are identical.
// The kernel actually returns the edit distance between two elements (you can customize this, or pass nil).
func Matrix[T Number](m uint, n uint, deletion func(uint) *T, insertion func(uint) *T,
substCost func(uint, uint) *T, kernel func(d []T, i uint, j uint, n uint, cost *T, delCost *T, insCost *T)) []T {
if kernel == nil {
kernel = Kernel[T]
}
if deletion == nil {
deletion = One[T]
}
if insertion == nil {
insertion = One[T]
}
m++
n++
var d = make([]T, m*n)
var none T
for i := uint(1); i < m; i++ {
var dele *T
if deletion != nil {
dele = deletion(i - 1)
}
if dele == nil {
d[n*i+0] = d[n*(i-1)+0] + none
} else {
d[n*i+0] = d[n*(i-1)+0] + *dele
}
}
for j := uint(1); j < n; j++ {
var inse *T
if insertion != nil {
inse = insertion(j - 1)
}
if inse == nil {
d[n*0+j] = d[n*0+j-1] + none
} else {
d[n*0+j] = d[n*0+j-1] + *inse
}
}
for j := uint(1); j < n; j++ {
for i := uint(1); i < m; i++ {
var cost, delCost, insCost *T
if substCost != nil {
cost = substCost(i-1, j-1)
}
if deletion != nil {
delCost = deletion(i - 1)
}
if insertion != nil {
insCost = insertion(j - 1)
}
if cost == nil {
cost = &none
}
if delCost == nil {
delCost = &none
}
if insCost == nil {
insCost = &none
}
kernel(d, i, j, n, cost, delCost, insCost)
}
}
return d
}
// MatrixTSlices is the transposed Levenshtein algorithm for edit distance between slices. Use this instead of MatrixT,
// if your inputs are slices anyway. First two parameters are comparable slices, the other parameters can be nil or customized.
func MatrixTSlices[T Number, S comparable](ms, ns []S, deletion func(uint) *T, insertion func(uint) *T,
substCost func(*S, *S) *T, kernel func(d []T, i uint, j uint, n uint, cost *T, delCost *T, insCost *T)) []T {
if substCost == nil {
substCost = OneElements[S, T]
}
return MatrixT(uint(len(ms)), uint(len(ns)), deletion, insertion, func(m uint, n uint) *T {
return substCost(&ms[m], &ns[n])
}, kernel)
}
// MatrixSlices is the Levenshtein algorithm for edit distance between slices. Use this instead of Matrix,
// if your inputs are slices anyway. First two parameters are comparable slices, the other parameters can be nil or customized.
func MatrixSlices[T Number, S comparable](ms, ns []S, deletion func(uint) *T, insertion func(uint) *T,
substCost func(*S, *S) *T, kernel func(d []T, i uint, j uint, n uint, cost *T, delCost *T, insCost *T)) []T {
if substCost == nil {
substCost = OneElements[S, T]
}
return Matrix(uint(len(ms)), uint(len(ns)), deletion, insertion, func(m uint, n uint) *T {
return substCost(&ms[m], &ns[n])
}, kernel)
}