-
Notifications
You must be signed in to change notification settings - Fork 49
/
alignment.py
166 lines (141 loc) · 5.29 KB
/
alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#This software is a free software. Thus, it is licensed under GNU General Public License.
#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.
#Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>
# zeros() was origianlly from NumPy.
# This version is implemented by alevchuk 2011-04-10
def zeros(shape):
retval = []
for x in range(shape[0]):
retval.append([])
for y in range(shape[1]):
retval[-1].append(0)
return retval
match_award = 10
mismatch_penalty = -5
gap_penalty = -5 # both for opening and extanding
def match_score(alpha, beta):
if alpha == beta:
return match_award
elif alpha == '-' or beta == '-':
return gap_penalty
else:
return mismatch_penalty
def finalize(align1, align2):
align1 = align1[::-1] #reverse sequence 1
align2 = align2[::-1] #reverse sequence 2
i,j = 0,0
#calcuate identity, score and aligned sequeces
symbol = ''
found = 0
score = 0
identity = 0
for i in range(0,len(align1)):
# if two AAs are the same, then output the letter
if align1[i] == align2[i]:
symbol = symbol + align1[i]
identity = identity + 1
score += match_score(align1[i], align2[i])
# if they are not identical and none of them is gap
elif align1[i] != align2[i] and align1[i] != '-' and align2[i] != '-':
score += match_score(align1[i], align2[i])
symbol += ' '
found = 0
#if one of them is a gap, output a space
elif align1[i] == '-' or align2[i] == '-':
symbol += ' '
score += gap_penalty
identity = float(identity) / len(align1) * 100
print 'Identity =', "%3.3f" % identity, 'percent'
print 'Score =', score
print align1
print symbol
print align2
def needle(seq1, seq2):
m, n = len(seq1), len(seq2) # length of two sequences
# Generate DP table and traceback path pointer matrix
score = zeros((m+1, n+1)) # the DP table
# Calculate DP table
for i in range(0, m + 1):
score[i][0] = gap_penalty * i
for j in range(0, n + 1):
score[0][j] = gap_penalty * j
for i in range(1, m + 1):
for j in range(1, n + 1):
match = score[i - 1][j - 1] + match_score(seq1[i-1], seq2[j-1])
delete = score[i - 1][j] + gap_penalty
insert = score[i][j - 1] + gap_penalty
score[i][j] = max(match, delete, insert)
# Traceback and compute the alignment
align1, align2 = '', ''
i,j = m,n # start from the bottom right cell
while i > 0 and j > 0: # end toching the top or the left edge
score_current = score[i][j]
score_diagonal = score[i-1][j-1]
score_up = score[i][j-1]
score_left = score[i-1][j]
if score_current == score_diagonal + match_score(seq1[i-1], seq2[j-1]):
align1 += seq1[i-1]
align2 += seq2[j-1]
i -= 1
j -= 1
elif score_current == score_left + gap_penalty:
align1 += seq1[i-1]
align2 += '-'
i -= 1
elif score_current == score_up + gap_penalty:
align1 += '-'
align2 += seq2[j-1]
j -= 1
# Finish tracing up to the top left cell
while i > 0:
align1 += seq1[i-1]
align2 += '-'
i -= 1
while j > 0:
align1 += '-'
align2 += seq2[j-1]
j -= 1
finalize(align1, align2)
def water(seq1, seq2):
m, n = len(seq1), len(seq2) # length of two sequences
# Generate DP table and traceback path pointer matrix
score = zeros((m+1, n+1)) # the DP table
pointer = zeros((m+1, n+1)) # to store the traceback path
max_score = 0 # initial maximum score in DP table
# Calculate DP table and mark pointers
for i in range(1, m + 1):
for j in range(1, n + 1):
score_diagonal = score[i-1][j-1] + match_score(seq1[i-1], seq2[j-1])
score_up = score[i][j-1] + gap_penalty
score_left = score[i-1][j] + gap_penalty
score[i][j] = max(0,score_left, score_up, score_diagonal)
if score[i][j] == 0:
pointer[i][j] = 0 # 0 means end of the path
if score[i][j] == score_left:
pointer[i][j] = 1 # 1 means trace up
if score[i][j] == score_up:
pointer[i][j] = 2 # 2 means trace left
if score[i][j] == score_diagonal:
pointer[i][j] = 3 # 3 means trace diagonal
if score[i][j] >= max_score:
max_i = i
max_j = j
max_score = score[i][j];
align1, align2 = '', '' # initial sequences
i,j = max_i,max_j # indices of path starting point
#traceback, follow pointers
while pointer[i][j] != 0:
if pointer[i][j] == 3:
align1 += seq1[i-1]
align2 += seq2[j-1]
i -= 1
j -= 1
elif pointer[i][j] == 2:
align1 += '-'
align2 += seq2[j-1]
j -= 1
elif pointer[i][j] == 1:
align1 += seq1[i-1]
align2 += '-'
i -= 1
finalize(align1, align2)