-
Notifications
You must be signed in to change notification settings - Fork 0
/
aligner_daewoo.py
139 lines (117 loc) · 4.94 KB
/
aligner_daewoo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from random import shuffle
#fixme must go through a unit test
def get_permuted_list(sequence_list):
permuted_list = []
for sequence in sequence_list:
s = list(sequence)
shuffle(s)
permuted_list.append(''.join(s))
return permuted_list
def get_permutated_sequence_distance(subject_list, query_list):
subject_permutations = get_permuted_list(subject_list)
query_permutations = get_permuted_list(query_list)
permutation_scores = []
index = 0
while index < len(subject_list):
permutation_scores.append(get_score(subject_permutations[index], query_permutations[index]))
index += 1
return get_evolutionary_distance(permutation_scores, len(subject_permutations[0]))
def get_score(subject, query):
"""
assumes that the two sequences are the same length
:param subject: string coming from seq list 1. What we are trying to prove to be homologous
:param query: string coming from seq list 2. What our subject is homologous to
:return: the score based on number of mismatch (+1)
"""
mismatches = 0
for index, base in enumerate(subject):
if base != query[index]:
mismatches += 1
return mismatches
def get_evolutionary_distance(score_list, m):
score_sum = 0
for score in score_list:
score_sum += (score / m) ** 2
return score_sum ** .5
def find_all_distances(subject_dict, query_dict): # fixme
distances = []
original_scores = []
subject_sequences = []
query_sequences = []
for key in subject_dict.keys():
for subject in subject_dict[key]:
if len(query_dict[key]) != 0:
subject_sequences.append(subject)
for query in query_dict[key]:
query_sequences.append(query)
for n, subject in enumerate(subject_sequences):
original_scores.append(get_score(subject, query_sequences[n]))
distances.append(get_evolutionary_distance(original_scores, len(subject_sequences[0])))
counter = 0
while counter < 1000:
distances.append(get_permutated_sequence_distance(subject_sequences, query_sequences))
counter += 1
print(counter)
return distances
def find_p_value(distance_list): # fixme
counter = 0
original_distance = distance_list[0]
for dist in distance_list:
if dist < original_distance:
counter += 1
return (counter + 1) / (len(distance_list)) # no +1 since original dist is in the dist list too
"""
def same_tRNA_input():
subject_list = []
query_dict = {}
scores = []
with open(inputFile1, 'r') as input1:
copied_list = []
for line in input1:
copied_list.append(line)
for i, line in enumerate(copied_list):
if line[0:1] == 'C' or line[0:1] == 'G' or line[0:1] == 'A' or line[0:1] == 'T' or line[0:1] == 'U':
subject_list.append(line.replace('\n', ''))
tRNA_index_list.append(copied_list[i - 1])
with open(inputFile2, 'r') as input2:
copied_list = []
for line in input2:
copied_list.append(line)
for i, line in enumerate(copied_list):
if (copied_list[i - 1] in tRNA_index_list) and (line[0:1] == 'C' or line[0:1] == 'G' or line[0:1] == 'A' or line[0:1] == 'T' or line[0:1] == 'U'):
sequence_list2.append(line.replace('\n', ''))
"""
user_input1 = input('Enter the name of the first input file.\n')
user_input2 = input('Enter the name of the second input file.\n')
inputFile1 = 'C:\\Users\\Yunsoo\\Desktop\\research\\' + user_input1
inputFile2 = 'C:\\Users\\Yunsoo\\Desktop\\research\\' + user_input2
outputFile = 'C:\\Users\\Yunsoo\\Desktop\\research\\sa_daewoo_results\\' + user_input1[:-4] +'_vs_' + user_input2[:-4] + '.txt'
subject_dict = {}
query_dict = {}
scores = []
with open(inputFile1, 'r') as input1:
copied_list = []
for line in input1:
copied_list.append(line)
for i, line in enumerate(copied_list):
if line[0:1] == '>' and len(copied_list[i + 1]) == len(copied_list[1]):
if not line in subject_dict:
subject_dict[line] = []
subject_dict[line].append(copied_list[i + 1].replace('\n', ''))
else:
subject_dict[line].append(copied_list[i + 1].replace('\n', ''))
query_dict[line] = []
with open(inputFile2, 'r') as input2:
copied_list = []
for line in input2:
copied_list.append(line)
for i, line in enumerate(copied_list):
if line in query_dict:
query_dict[line].append(copied_list[i + 1].replace('\n', ''))
with open(outputFile, 'w') as output:
distances = find_all_distances(subject_dict, query_dict)
p_value = find_p_value(distances)
output.write("{0:.5f}\n".format(p_value))
for dist in distances:
output.write("{0:.3f}\n".format(dist))
print(p_value)