diff --git a/share/minizinc/linear/fzn_edit_distance.mzn b/share/minizinc/linear/fzn_edit_distance.mzn new file mode 100644 index 000000000..ba01b9ce1 --- /dev/null +++ b/share/minizinc/linear/fzn_edit_distance.mzn @@ -0,0 +1,375 @@ +% Longest common subsequence distance (insertion cost = deletion cost = 1, substitution cost = 2) +predicate fzn_edit_distance(int: max_char, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + let { int: len = length(S1); + var 0..len: p1; + var 0..len: p2; + var 0..len: kept_sum; + array[1..len] of var 0..len: kept_r1_from_position; + array[1..len] of var 0..len: kept_r2_from_position; + } in + ED = p1 + p2 - 2 * kept_sum + /\ + fzn_edit_distance_kept(max_char, array1d(1..len, S1), array1d(1..len, S2), p1, p2, kept_sum); + +% Fixed insertion/deletion/substitution costs +predicate fzn_edit_distance(int: max_char, + int: C_ins, + int: C_del, + int: C_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + let { int: len = length(S1); + var 0..len: insertions; + var 0..len: deletions; + var 0..len: substitutions; + } in + ED = C_ins * insertions + C_sub * substitutions + C_del * deletions + /\ + fzn_edit_distance_kept_general(max_char, array1d(1..len, S1), array1d(1..len, S2), insertions, deletions, substitutions); + +% Linearized decomposition from the paper "Constraint-Based Scheduling for Paint Shops in the Automotive Supply Industry", Winter and Musliu, ACM Transactions on Intelligent Systems and Technology, 2020. +predicate fzn_edit_distance_kept(int: max_char, + array[int] of var int: S1, + array[int] of var int: S2, + var int: p1, + var int: p2, + var int: kept_sum) = + let { int: len = length(S1); + array[1..len] of var 0..len: kept_r1_from_position; + array[1..len] of var 0..len: kept_r2_from_position; + array[1..len,0..len] of var 0..1: kept_r2_from_position_i; + array[1..len,0..len] of var 0..1: kept_r1_from_position_i; + array[1..len] of var 1..max_char: z; + array[1..len] of var 0..1: kept; + array[1..len] of var 0..1: used1; + array[1..len] of var 0..1: used2; + array[1..len] of var 0..max_char: kept_char; + } in + % see which positions are used (i.e. positions which are not 0) + forall(i in 1..len)( + S1[i] <= max_char*used1[i] + /\ + used1[i] < 1 + S1[i] + /\ + S2[i] <= max_char*used2[i] + /\ + used2[i] < 1 + S2[i] + ) + /\ + p1 = sum(i in 1..len)(used1[i]) + /\ + p2 = sum(i in 1..len)(used2[i]) + /\ + kept_sum = sum(i in 1..len)(kept[i]) + /\ + forall(k in 2..len)( + kept_r1_from_position[k] + (1-kept[k])*(len+1) > kept_r1_from_position[k-1] + /\ + kept_r2_from_position[k] + (1-kept[k])*(len+1) > kept_r2_from_position[k-1] + ) + /\ + forall(i in 1..len-1)( + kept[i+1] <= kept[i] + /\ + used1[i+1] <= used1[i] + /\ + used2[i+1] <= used2[i] + ) + /\ + forall(k in 1..len)( + kept[k]*len >= kept_r1_from_position[k] + /\ + kept[k]*len >= kept_r2_from_position[k] + /\ + kept[k] <= kept_r1_from_position[k] + /\ + kept[k] <= kept_r2_from_position[k] + ) + /\ + forall(k in 1..len)( + kept_r2_from_position[k] - (1-kept[k])*len <= p1 + /\ + kept_r1_from_position[k] - (1-kept[k])*len <= p2 + ) + /\ + forall(i in 1..len)( + sum(j in 0..len)(kept_r2_from_position_i[i,j]) == 1 + /\ + sum(j in 0..len)(kept_r2_from_position_i[i,j]*j) == kept_r2_from_position[i] + /\ + sum(j in 0..len)(kept_r1_from_position_i[i,j]) == 1 + /\ + sum(j in 0..len)(kept_r1_from_position_i[i,j]*j) == kept_r1_from_position[i] + ) + /\ + forall(i in 1..len)( + z[i] <= 1 + (1-kept_r2_from_position_i[i,0])*max_char + ) + /\ + forall(i in 1..len, j in 1..len)( + z[i] - (1-kept[i])*max_char <= S1[j] + (1-kept_r2_from_position_i[i,j])*max_char + /\ + z[i] + (1-kept[i])*max_char >= S1[j] - (1-kept_r2_from_position_i[i,j])*max_char + ) + /\ + forall(i in 1..len, j in 1..len)( + z[i] - (1-kept[i])*max_char <= S2[j] + (1-kept_r1_from_position_i[i,j])*max_char + /\ + z[i] + (1-kept[i])*max_char >= S2[j] - (1-kept_r1_from_position_i[i,j])*max_char + ); + +% Linearized decomposition based on the paper "Constraint-Based Scheduling for Paint Shops in the Automotive Supply Industry", Winter and Musliu, ACM Transactions on Intelligent Systems and Technology, 2020. +predicate fzn_edit_distance_kept_general(int: max_char, + array[int] of var int: S1, + array[int] of var int: S2, + var int: insertions, + var int: deletions, + var int: substitutions) = + let { int: len = length(S1); + var 0..len: p1; + var 0..len: p2; + var 0..1: p1_greater; + var 0..len: kept_sum; + array[1..len] of var 0..len: kept_r1_from_position; + array[1..len] of var 0..len: kept_r2_from_position; + array[1..len,0..len] of var 0..1: kept_r2_from_position_i; + array[1..len,0..len] of var 0..1: kept_r1_from_position_i; + array[1..len] of var 1..max_char: z; + array[1..len] of var 0..1: kept; + array[1..len] of var 0..1: kept_equal; + array[1..len] of var 0..1: used1; + array[1..len] of var 0..1: used2; + array[1..len] of var 0..max_char: kept_char; + } in + substitutions = sum(i in 1..len)(1-kept_equal[i]) + /\ + insertions >= p2 - kept_sum + /\ + deletions >= p1 - kept_sum + /\ + forall(i in 1..len)( + S1[i] <= max_char*used1[i] + /\ + used1[i] < 1 + S1[i] + /\ + S2[i] <= max_char*used2[i] + /\ + used2[i] < 1 + S2[i] + ) + /\ + p1 = sum(i in 1..len)(used1[i]) + /\ + p2 = sum(i in 1..len)(used2[i]) + /\ + kept_sum = sum(i in 1..len)(kept[i]) + /\ + forall(k in 2..len)( + kept_r1_from_position[k] + (1-kept[k])*(len+1) > kept_r1_from_position[k-1] + /\ + kept_r2_from_position[k] + (1-kept[k])*(len+1) > kept_r2_from_position[k-1] + ) + /\ + forall(i in 1..len-1)( + kept[i+1] <= kept[i] + /\ + used1[i+1] <= used1[i] + /\ + used2[i+1] <= used2[i] + ) + /\ + forall(k in 1..len)( + kept[k]*len >= kept_r1_from_position[k] + /\ + kept[k]*len >= kept_r2_from_position[k] + /\ + kept[k] <= kept_r1_from_position[k] + /\ + kept[k] <= kept_r2_from_position[k] + /\ + kept[k] >= 1-kept_equal[k] + ) + /\ + forall(k in 1..len)( + kept_r2_from_position[k] - (1-kept[k])*len <= p1 + /\ + kept_r1_from_position[k] - (1-kept[k])*len <= p2 + ) + /\ + forall(i in 1..len)( + sum(j in 0..len)(kept_r2_from_position_i[i,j]) == 1 + /\ + sum(j in 0..len)(kept_r2_from_position_i[i,j]*j) == kept_r2_from_position[i] + /\ + sum(j in 0..len)(kept_r1_from_position_i[i,j]) == 1 + /\ + sum(j in 0..len)(kept_r1_from_position_i[i,j]*j) == kept_r1_from_position[i] + ) + /\ + forall(i in 1..len)( + z[i] <= 1 + (1-kept_r2_from_position_i[i,0])*max_char + ) + /\ + forall(i in 1..len, j in 1..len)( + z[i] - (1-kept_equal[i])*max_char <= S1[j] + (1-kept_r2_from_position_i[i,j])*max_char + /\ + z[i] + (1-kept_equal[i])*max_char >= S1[j] - (1-kept_r2_from_position_i[i,j])*max_char + ) + /\ + forall(i in 1..len, j in 1..len)( + z[i] - (1-kept_equal[i])*max_char <= S2[j] + (1-kept_r1_from_position_i[i,j])*max_char + /\ + z[i] + (1-kept_equal[i])*max_char >= S2[j] - (1-kept_r1_from_position_i[i,j])*max_char + ); + +% Based on the MIP formulation proposed in the paper: +% "Finding median and center strings for a probability distribution on a set of strings under Levenshtein distance based on integer linear programming", Hayashida and Koyano, BIOSTEC 2016. +predicate fzn_edit_distance(int: max_char, + array[int] of int: W_ins, + array[int] of int: W_del, + array[int, int] of int: W_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + let { int: len = length(S1); + + array[int] of var int: O1 = array1d(1..len, S1); + array[int] of var int: O2 = array1d(1..len, S2); + array[int] of int: W_ins_0 = array1d(0..length(W_ins), [0] ++ W_ins); + int: max_ic = max(W_ins); + array[int] of int: W_del_0 = array1d(0..length(W_del), [0] ++ W_del); + int: max_dc = max(W_del); + array[int, int] of int: W_sub_0 = array2d(0..max_char, 0..max_char, + [ if i == 0 then W_ins_0[j] else if j == 0 then W_del_0[i] else W_sub[i,j] endif endif | i in 0..max_char, j in 0..max_char]); + int: max_sc = max(W_sub_0); + + array[0..len, 0..len] of var 0..1: x; + array[0..len, 0..len] of var 0..max_dc: x_c; + array[0..len, 0..len] of var 0..1: y; + array[0..len, 0..len] of var 0..max_ic: y_c; + array[0..len, 0..len] of var 0..1: z; + + array[1..len, 1..len] of var 0..1: g; + array[1..len, 1..len] of var 0..1: h; + array[1..len, 1..len] of var 0..max_sc: h_c; + + + array[1..len] of var 0..1: used1; + array[1..len] of var 0..1: used2; + } in + % see which positions are used (i.e. positions which are not 0) + forall(i in 1..len)( + O1[i] <= max_char*used1[i] + /\ + used1[i] < 1 + O1[i] + ) + /\ + forall(i in 1..len)( + O2[i] <= max_char*used2[i] + /\ + used2[i] < 1 + O2[i] + ) + /\ + % check that strings are correctly aligned + len = len + /\ + forall(i in 1..len-1)( + used1[i+1] <= used1[i] + ) + /\ + forall(i in 1..len-1)( + used2[i+1] <= used2[i] + ) + /\ + % a1 + 1 = x[1,0] + y[1,0] + z[1,1] + /\ + % a2 + forall(i in 1..len-1)( + x[i,0] = x[i+1,0] + y[i,1] + z[i+1,1] + ) + /\ + % a3 + x[len,0] = y[len,1] + /\ + % a4 + forall(j in 1..len-1)( + y[0,j] = x[1,j] + y[0,j+1] + z[1,j+1] + ) + /\ + % a5 + y[0,len] = x[1,len] + /\ + % a6 + forall(i in 1..len-1, j in 1..len-1)( + x[i,j] + y[i,j] + z[i,j] = x[i+1,j] + y[i,j+1] + z[i+1,j+1] + ) + /\ + % a7 + forall(j in 1..len-1)( + x[len,j] + y[len,j] + z[len,j] = y[len,j+1] + ) + /\ + % a8 + forall(i in 1..len-1)( + x[i,len] + y[i,len] + z[i,len] = x[i+1,len] + ) + /\ + % a9 + x[len,len] + y[len,len] + z[len,len] = 1 + /\ + % special 0 length string constraint, we need to force a variable in this case + y[0,len] * len >= len * (1-len) + /\ + % b + forall(j in 1..len)( + y[len,j] >= (1/len) * (j-len) + ) + /\ + % c1 + forall(i in 1..len, j in 1..len)( + O1[i] - O2[j] <= max_char * g[i,j] + ) + /\ + % c2 + forall(i in 1..len, j in 1..len)( + O2[j] - O1[i] <= max_char * g[i,j] + ) + /\ + % d2 + forall(i in 1..len, j in 1..len)( + h[i,j] >= z[i,j] + g[i,j] - 1 + ) + /\ + forall(i in 1..len, j in 1..len)( + h[i,j] <= 1/2 * (z[i,j] + g[i,j]) + ) + /\ + % if the empty character zero is used, we forbid substitution + forall(i in 1..len, j in 1..len)( + O1[i] >= h[i,j] + /\ + O2[j] >= h[i,j] + ) + /\ + % linearize deletion costs + forall(i in 1..len, j in 0..len)( + x_c[i,j] >= W_del_0[O1[i]] - (1-x[i,j])*max_dc + ) + /\ + % linearize insertion costs + forall(i in 0..len, j in 1..len)( + y_c[i,j] >= W_ins_0[O2[j]] - (1-y[i,j])*max_ic + ) + /\ + % linearize substitution costs + forall(i in 1..len, j in 1..len)( + h_c[i,j] >= W_sub_0[O1[i],O2[j]] - (1-h[i,j])*max_sc + ) + /\ + ED = sum(i in 1..len)(x_c[i,0]) + + sum(j in 1..len)(y_c[0,j]) + + sum(i in 1..len, j in 1..len)(x_c[i,j] + y_c[i,j] + h_c[i,j]); \ No newline at end of file diff --git a/share/minizinc/std/edit_distance.mzn b/share/minizinc/std/edit_distance.mzn new file mode 100644 index 000000000..20d4a0aec --- /dev/null +++ b/share/minizinc/std/edit_distance.mzn @@ -0,0 +1,181 @@ +include "fzn_edit_distance.mzn"; +include "fzn_edit_distance_exact.mzn"; + +/** @group globals + Constrains \a ED to be greater than or equal to the minimum edit distance between two strings \a S1 and \a S2. + Any character insertion or deletion has a cost of 1, whereas any character substitution has a cost of 2. + All characters need to be numbered from 1 to \a max_char. + The integer arrays \a S1 and \a S2 need to have the same length, but may contain zeros at the end to indicate an "empty" character. + Example: Let \a max_char = 2, \a S1 = [1,1,0,0,0], \a S2 = [1,2,1,0,0], then \a ED = 1, as the minimum edit distance is 1 between the strings "11" and "121" (a single '2' character needs to be inserted). + +@param max_char: The maximum character that can occur in a string (all characters should be numbered from 1..max_char, 0 represents an empty character) +@param S1: Integer array that represents the first string +@param S2: Integer array that represents the second string +@param ED: Variable that is constrained to be >= than the minimum edit distance between \a S1 to \a S2 +*/ +predicate edit_distance(int: max_char, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + assert(length(S1) == length(S2), + "edit_distance: string arrays S1 and S2 need to have the same length", + assert(max_char >= 1, + "edit_distance: max_char must be >= 1", + fzn_edit_distance(max_char, S1, S2, ED) + )); + +/** @group globals + Constrains \a ED to be greater than or equal to the minimum edit distance between two strings \a S1 and \a S2. + This constraint functions in the same way as the constraint \a edit_distance(max_char, S1, S2, ED), however it additionally specifies custom character insertion, deletion and substitution costs. + All costs should be >= 1 and the following inequality must hold: C_sub <= C_ins + C_del + +@param max_char: The maximum character that can occur in a string (all characters should be numbered from 1..max_char, 0 represents an empty character) +@param C_ins: Costs for a single character insertion +@param C_del: Costs for a single character deletion +@param C_sub: Costs for a single character substitution +@param S1: Integer array that represents the first string +@param S2: Integer array that represents the second string +@param ED: Variable that is constrained to be >= than the minimum edit distance between \a S1 to \a S2 +*/ +predicate edit_distance(int: max_char, + int: C_ins, + int: C_del, + int: C_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + assert(length(S1) == length(S2), + "edit_distance: string arrays S1 and S2 need to have the same length", + assert(max_char >= 1, + "edit_distance: max_char must be >= 1", + assert(C_sub <= C_ins + C_del, + "edit_distance: The following inequality must hold C_sub <= C_ins + C_del", + assert(C_sub >= 1 /\ C_ins >= 1 /\ C_del >= 1, + "edit_distance: C_sub, C_ins, C_del all must be >= 1", + fzn_edit_distance(max_char, C_ins, C_del, C_sub, S1, S2, ED) + )))); + +/** @group globals + Constrains \a ED to be greater than or equal to the minimum edit distance between two strings \a S1 and \a S2. + This constraint functions in the same way as the constraint \a edit_distance(max_char, S1, S2, ED), however it additionally specifies custom character insertion, deletion and substitution costs for each individual combination of characters. + All costs should be >= 1, except the substitution costs of a character with itself which should be 0 (W_sub[c,c] = 0). + Furthermore, the following inequality must hold for all pairs of characters c1 and c2: \a W_sub[c1,c2] <= \a W_del[c1] + \a W_ins[c2] + +@param max_char: The maximum character that can occur in a string (all characters should be numbered from 1..max_char, 0 represents an empty character) +@param W_ins: Integer array containing the costs for single character insertions +@param W_del: Integer array containing the costs for single character deletions +@param W_sub: Integer array containing the costs for single character substitutions +@param S1: Integer array that represents the first string +@param S2: Integer array that represents the second string +@param ED: Variable that is constrained to be >= than the minimum edit distance between \a S1 to \a S2 +*/ +predicate edit_distance(int: max_char, + array[int] of int: W_ins, + array[int] of int: W_del, + array[int, int] of int: W_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + assert(length(S1) == length(S2), + "edit_distance: string arrays S1 and S2 need to have the same length", + assert(index_set(W_ins) == 1..max_char /\ index_set(W_del) == 1..max_char, + "edit_distance: index sets of W_ins and W_del must be 1..max_char", + assert(index_set_1of2(W_sub) == 1..max_char /\ index_set_2of2(W_sub) == 1..max_char, + "edit_distance: both index sets of W_sub must be 1..max_char", + assert(max_char >= 1, + "edit_distance: max_char must be >= 1", + assert(forall(c1 in 1..max_char, c2 in 1..max_char)(W_sub[c1,c2] <= W_ins[c2] + W_del[c1]), + "edit_distance: The following inequality must hold for all c1, c2 in 1..max_char: W_sub[c1,c2] <= W_ins[c2] + W_del[c1]", + assert(forall(c in 1..max_char)(W_ins[c] >= 1 /\ W_del[c] >= 1), + "edit_distance: for all c in 1..max_char: W_ins[c] and W_del[c] must be >= 1", + assert(forall(c1 in 1..max_char, c2 in 1..max_char where c1 != c2)(W_sub[c1,c2] >= 1), + "edit_distance: for all c1, c2 in 1..max_char where c1 != c2: W_sub[c1,c2] must be >= 1", + assert(forall(c in 1..max_char)(W_sub[c,c] == 0), + "edit_distance: for all c in 1..max_char: W_sub[c,c] must be == 0", + fzn_edit_distance(max_char, W_ins, W_del, W_sub, S1, S2, ED) + )))))))); + +/** @group globals + Constrains \a ED to be equal to the minimum edit distance between two strings \a S1 and \a S2 (Note that this constraint is likely to be less efficient than its "edit_distance" version when you are aiming to minimize the edit distance). + Any character insertion or deletion has a cost of 1, whereas any character substitution has a cost of 2. + All characters need to be numbered from 1 to \a max_char. + The integer arrays \a S1 and \a S2 need to have the same length, but may contain zeros at the end to indicate an "empty" character. + Example: Let \a max_char = 2, \a S1 = [1,1,0,0,0], \a S2 = [1,2,1,0,0], then \a ED = 1, as the minimum edit distance is 1 between the strings "11" and "121" (a single '2' character needs to be inserted). + +@param max_char: The maximum character that can occur in a string (all characters should be numbered from 1..max_char, 0 represents an empty character) +@param S1: Integer array that represents the first string +@param S2: Integer array that represents the second string +@param ED: Variable that is constrained to be equal to the minimum edit distance between \a S1 to \a S2 +*/ +predicate edit_distance_exact(int: max_char, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + edit_distance_exact(max_char, 1, 1, 2, S1, S2, ED); + +/** @group globals + Constrains \a ED to be equal to the minimum edit distance between two strings \a S1 and \a S2 (Note that this constraint is likely to be less efficient than its "edit_distance" version when you are aiming to minimize the edit distance). + This constraint functions in the same way as the constraint \a edit_distance_exact(max_char, S1, S2, ED), however it additionally specifies custom character insertion, deletion and substitution costs. + All costs should be >= 1 and the following inequality must hold: C_sub <= C_ins + C_del + +@param max_char: The maximum character that can occur in a string (all characters should be numbered from 1..max_char, 0 represents an empty character) +@param C_ins: Costs for a single character insertion +@param C_del: Costs for a single character deletion +@param C_sub: Costs for a single character substitution +@param S1: Integer array that represents the first string +@param S2: Integer array that represents the second string +@param ED: Variable that is constrained to be equal to the minimum edit distance between \a S1 to \a S2 +*/ +predicate edit_distance_exact(int: max_char, + int: C_ins, + int: C_del, + int: C_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + assert(C_sub <= C_ins + C_del, + "edit_distance: The following inequality must hold C_sub <= C_ins + C_del", + assert(C_sub >= 1 /\ C_ins >= 1 /\ C_del >= 1, + "edit_distance: C_sub, C_ins, C_del all must be >= 1", + edit_distance_exact(max_char, [C_ins | i in 1..max_char], [C_del | i in 1..max_char], array2d(1..max_char, 1..max_char, [if i == j then 0 else C_sub endif | i in 1..max_char, j in 1..max_char]), S1, S2, ED) + )); + +/** @group globals + Constrains \a ED to be equal to the minimum edit distance between two strings \a S1 and \a S2 (Note that this constraint is likely to be less efficient than its "edit_distance" version when you are aiming to minimize the edit distance). + This constraint functions in the same way as the constraint \a edit_distance_exact(max_char, S1, S2, ED), however it additionally specifies custom character insertion, deletion and substitution costs for each individual combination of characters. + All costs should be >= 1, except the substitution costs of a character with itself which should be 0 (W_sub[c,c] = 0). + Furthermore, the following inequality must hold for all pairs of characters c1 and c2: \a W_sub[c1,c2] <= \a W_del[c1] + \a W_ins[c2] + +@param max_char: The maximum character that can occur in a string (all characters should be numbered from 1..max_char, 0 represents an empty character) +@param W_ins: Integer array containing the costs for single character insertions +@param W_del: Integer array containing the costs for single character deletions +@param W_sub: Integer array containing the costs for single character substitutions +@param S1: Integer array that represents the first string +@param S2: Integer array that represents the second string +@param ED: Variable that is constrained to be equal to the minimum edit distance between \a S1 to \a S2 +*/ +predicate edit_distance_exact(int: max_char, + array[int] of int: W_ins, + array[int] of int: W_del, + array[int, int] of int: W_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + assert(length(S1) == length(S2), + "edit_distance: string arrays S1 and S2 need to have the same length", + assert(index_set(W_ins) == 1..max_char /\ index_set(W_del) == 1..max_char, + "edit_distance: index sets of W_ins and W_del must be 1..max_char", + assert(index_set_1of2(W_sub) == 1..max_char /\ index_set_2of2(W_sub) == 1..max_char, + "edit_distance: both index sets of W_sub must be 1..max_char", + assert(max_char >= 1, + "edit_distance: max_char must be >= 1", + assert(forall(c1 in 1..max_char, c2 in 1..max_char)(W_sub[c1,c2] <= W_ins[c2] + W_del[c1]), + "edit_distance: The following inequality must hold for all c1, c2 in 1..max_char: W_sub[c1,c2] <= W_ins[c2] + W_del[c1]", + assert(forall(c in 1..max_char)(W_ins[c] >= 1 /\ W_del[c] >= 1), + "edit_distance: for all c in 1..max_char: W_ins[c] and W_del[c] must be >= 1", + assert(forall(c1 in 1..max_char, c2 in 1..max_char where c1 != c2)(W_sub[c1,c2] >= 1), + "edit_distance: for all c1, c2 in 1..max_char where c1 != c2: W_sub[c1,c2] must be >= 1", + assert(forall(c in 1..max_char)(W_sub[c,c] == 0), + "edit_distance: for all c in 1..max_char: W_sub[c,c] must be == 0", + fzn_edit_distance_exact(max_char, W_ins, W_del, W_sub, S1, S2, ED) + )))))))); \ No newline at end of file diff --git a/share/minizinc/std/fzn_edit_distance.mzn b/share/minizinc/std/fzn_edit_distance.mzn new file mode 100644 index 000000000..dff4778f7 --- /dev/null +++ b/share/minizinc/std/fzn_edit_distance.mzn @@ -0,0 +1,131 @@ +include "fzn_edit_distance_exact.mzn"; + +predicate fzn_edit_distance(int: max_char, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + let { + int: len = length(S1); + var 0..len: p1; + var 0..len: p2; + var 0..len: kept_sum; + int: l1 = min(index_set(S1)); + int: l2 = min(index_set(S2)); + int: u1 = max(index_set(S1)); + int: u2 = max(index_set(S2)); + int: na1 = l1-1; + int: na2 = l2-1; + array[1..len] of var na2..u2: kept_r1_from_position; + array[1..len] of var na1..u1: kept_r2_from_position; + } in + ED = p1 + p2 - 2 * kept_sum + /\ + forall(i in 1..len where kept_r1_from_position[i] != na2) ( + S1[kept_r2_from_position[i]] = S2[kept_r1_from_position[i]] + ) + /\ + fzn_edit_distance_kept(max_char, S1, S2, p1, p2, kept_r1_from_position, kept_r2_from_position, kept_sum); + +predicate fzn_edit_distance(int: max_char, + int: C_ins, + int: C_del, + int: C_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + let { int: len = length(S1); + var 0..len: p1; + var 0..len: p2; + var 0..len: kept_sum; + var 0..len: insertions; + var 0..len: deletions; + var 0..len: substitutions; + int: l1 = min(index_set(S1)); + int: l2 = min(index_set(S2)); + int: u1 = max(index_set(S1)); + int: u2 = max(index_set(S2)); + int: na1 = l1-1; + int: na2 = l2-1; + array[1..len] of var na2..u2: kept_r1_from_position; + array[1..len] of var na1..u1: kept_r2_from_position; + } in + substitutions = count(i in 1..len)(kept_r1_from_position[i] != na2 /\ S1[kept_r2_from_position[i]] != S2[kept_r1_from_position[i]]) + /\ + insertions = p2 - kept_sum + /\ + deletions = p1 - kept_sum + /\ + ED = C_ins * insertions + C_sub * substitutions + C_del * deletions + /\ + fzn_edit_distance_kept(max_char, S1, S2, p1, p2, kept_r1_from_position, kept_r2_from_position, kept_sum); + +% Decomposition from the paper "Constraint-Based Scheduling for Paint Shops in the Automotive Supply Industry", Winter and Musliu, ACM Transactions on Intelligent Systems and Technology, 2020. +predicate fzn_edit_distance_kept(int: max_char, + array[int] of var int: S1, + array[int] of var int: S2, + var int: p1, + var int: p2, + array[int] of var int: kept_r1_from_position, + array[int] of var int: kept_r2_from_position, + var int: kept_sum) = + let { int: l1 = min(index_set(S1)); + int: l2 = min(index_set(S2)); + int: u1 = max(index_set(S1)); + int: u2 = max(index_set(S2)); + int: len = length(S1); + int: na1 = l1-1; + int: na2 = l2-1; + var 0..len: insertions; + var 0..len: deletions; + var 0..len: substitutions; + } in + p1 = sum(i in l1..u1 where S1[i] != 0)(1) + /\ + p2 = sum(i in l2..u2 where S2[i] != 0)(1) + /\ + kept_sum = sum(i in 1..len where kept_r1_from_position[i] != na2)(1) + /\ + forall(i in l1..u1-1)( + S1[i] == 0 -> S1[i+1] == 0 + ) + /\ + forall(i in l2..u2-1)( + S2[i] == 0 -> S2[i+1] == 0 + ) + /\ + forall(i in 2..len where kept_r1_from_position[i] != na2) ( + kept_r1_from_position[i] > kept_r1_from_position[i-1] + ) + /\ + forall(i in 2..len where kept_r2_from_position[i] != na1) ( + kept_r2_from_position[i] > kept_r2_from_position[i-1] + ) + /\ + forall(i in 1..len) ( + kept_r1_from_position[i] > na2 <-> kept_r2_from_position[i] > na1 + ) + /\ + forall(i in 1..len-1) ( + (kept_r1_from_position[i] = na2) -> (kept_r1_from_position[i+1] = na2) + ) + /\ + forall(i in 1..len-1) ( + (kept_r2_from_position[i] = na1) -> (kept_r2_from_position[i+1] = na1) + ) + /\ + forall(i in 1..len where kept_r2_from_position[i] != na1) ( + kept_r2_from_position[i] <= l1-1 + p1 + ) + /\ + forall(i in 1..len where kept_r1_from_position[i] != na2) ( + kept_r1_from_position[i] <= l2-1 + p2 + ); + +predicate fzn_edit_distance(int: max_char, + array[int] of int: W_ins, + array[int] of int: W_del, + array[int, int] of int: W_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + fzn_edit_distance_exact(max_char, W_ins, W_del, W_sub, S1, S2, ED); \ No newline at end of file diff --git a/share/minizinc/std/fzn_edit_distance_exact.mzn b/share/minizinc/std/fzn_edit_distance_exact.mzn new file mode 100644 index 000000000..81c5bc615 --- /dev/null +++ b/share/minizinc/std/fzn_edit_distance_exact.mzn @@ -0,0 +1,60 @@ +% Dynamic programming routine decomposition for insertion/deletion/substition cost defined by character +% DP routine first proposed in the paper "The String-to-String Correction Problem", Wagner and Fischer, Journal of the ACM, 1978. +predicate fzn_edit_distance_exact(int: max_char, + array[int] of int: W_ins, + array[int] of int: W_del, + array[int, int] of int: W_sub, + array[int] of var int: S1, + array[int] of var int: S2, + var int: ED) = + let { int: l1 = min(index_set(S1)) - 1; + int: l2 = min(index_set(S2)) - 1; + int: u1 = max(index_set(S1)); + int: u2 = max(index_set(S2)); + array[int] of int: W_ins_0 = array1d(0..length(W_ins), [0] ++ W_ins); + array[int] of int: W_del_0 = array1d(0..length(W_del), [0] ++ W_del); + array[int, int] of int: W_sub_0 = array2d(0..max_char, 0..max_char, + [ if i == 0 then W_ins_0[j] else if j == 0 then W_del_0[i] else W_sub[i,j] endif endif | i in 0..max_char, j in 0..max_char]); + array[l1..u1,l2..u2] of var int: T; + } in + forall(i in l1+1..u1-1)( + S1[i] == 0 -> S1[i+1] == 0 + ) + /\ + forall(i in l2+1..u2-1)( + S2[i] == 0 -> S2[i+1] == 0 + ) + /\ + T[l1,l2] = 0 /\ + T[u1,u2] = ED /\ + forall(i in l1+1..u1, j in l2+1..u2)( + T[i,j] >= 0 /\ T[i,j] <= sum(i2 in l1+1..i)(W_del_0[S1[i2]]) + sum(j2 in l2+1..j)(W_ins_0[S2[j2]]) + ) + /\ + forall(j in l2+1..u2)( + T[l1,j] = sum(j2 in l2+1..j)(W_ins_0[S2[j2]]) + ) + /\ + forall(i in l1+1..u1)( + T[i,l2] = sum(i2 in l1+1..i)(W_del_0[S1[i2]]) + ) + /\ + forall(i in l1+1..u1, j in l2+1..u2)( + T[i,j] = + if S1[i] = S2[j] then + T[i-1,j-1] + else + if S1[i] = 0 then + T[i-1,j] + else + if S2[j] = 0 then + T[i,j-1] + else + min([T[i-1,j]+W_del_0[S1[i]], + T[i,j-1]+W_ins_0[S2[j]], + T[i-1,j-1]+W_sub_0[S1[i], S2[j]] + ]) + endif + endif + endif + ); \ No newline at end of file