-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_candidate_genes.py
executable file
·202 lines (154 loc) · 6.73 KB
/
get_candidate_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#! /usr/bin/python
#-------------------------------------------------------------------------------
def extract_GO_from_b2go(annot):
"""
return a dictionnary from an b2go.annot file
keys: sequence names
values: corresponding GO terms list
"""
go_dict = {}
annot_dict = {}
with open(annot, "r") as f:
for line in f:
line = line.split()
seq_id = line[0]
go = line[1]
# For now, skip the "EC" annotation
if go.startswith("EC:"):
continue
if seq_id in go_dict:
go_dict[ seq_id ].append(go)
else:
go_dict[ seq_id ] = [ go ]
if line[2:]: # if there is an annotation
annot_dict[ seq_id ] = " ".join(line[2:])
return go_dict, annot_dict
#-------------------------------------------------------------------------------
def get_all_annot(annot_files_lst):
"""
Get annotations from a list of files and return a dictionnary with:
keys: file name
values: list of annotations
"""
import os
total_annot = {}
for f in annot_files_lst:
go_dict, annot_dict = extract_GO_from_b2go(f)
fname = os.path.basename(f)
total_annot[fname] = annot_dict.values()
return total_annot
#-------------------------------------------------------------------------------
def filter_single_annots(total_annot):
"""
Filter a total_annot dictionnary to keep only annotations which are appearing
only once within a sample
"""
from collections import Counter
for n, annot in total_annot.items():
single_annots = [a for a, c in Counter(annot).items() if c == 1 ]
total_annot[n] = single_annots
return total_annot
#-------------------------------------------------------------------------------
def get_common_annots(total_annot):
"""
From a total_annot dictionnary (filtered or not)),
check which annotations are common to all
"""
from collections import Counter
# Group all annotations to a single list:
# set(j) in case filter_single_annots has not been used
annots = [ i for j in total_annot.values() for i in set(j) ]
# Get annots common to all (with a count equal to number of different samples)
common_a = [ a for a, c in Counter(annots).items() if c == len(total_annot)]
return common_a
#-------------------------------------------------------------------------------
def print_common_genes_summary(annot_files_lst):
"""
Print a report about the annotations common to all files in a annot_files_lst
"""
total_annot = get_all_annot(annot_files_lst)
total_annot = filter_single_annots(total_annot)
common_a = get_common_annots(total_annot)
for annot in common_a:
print annot
print "Total common annotations: %d" % len(common_a)
#-------------------------------------------------------------------------------
def get_seq_names_by_annot(annot_lst, annot_dict):
"""
Return the sequences names corresponding to the annotations in annot_lst
provided in the annot_dict with the correspondance between seq names and annot
"""
# Filter annot dict to get only the sequences with the annots in annot_lst
# As well reverse the dict to get annotations as keys
rev_annot = {annot_dict[k]: k for k in annot_dict
if annot_dict[k] in annot_lst }
return rev_annot
#-------------------------------------------------------------------------------
def fasta_2_dict(fasta_file):
"""
Get a dictionnary from fasta file with:
key = seq_id
value = seq
WORKING
"""
with open(fasta_file, "r") as f:
seq_dict = {}
fasta = f.read().split('>')[1:]
# split each time a ">" in encountered
tmp = [ seq.partition("\n") for seq in fasta ]
# split the strings in 3 items tuple: seq id, the sep, and the seq
for i in range( len( tmp ) ):
seq_dict[ tmp[i][0].split()[0] ] = tmp[i][2].replace( "\n", "").upper()
# build a dictionnary with key=seq id, value=seq
# made for trinity output: split()[0] removes the len, path infos
return seq_dict
#-------------------------------------------------------------------------------
def get_common_seqs(annot_files_lst, fastas_lst, outfolder):
"""
From a list of annotations files and of fasta files (in the SAME ORDER)
get a folder with a files with the sequences corresponding to an annotation
common to all samples.
"""
import os
total_annot = get_all_annot(annot_files_lst)
total_annot = filter_single_annots(total_annot)
common_a = get_common_annots(total_annot)
# Get annotations for each files and the seqnames corresponding to the common
# annotations
for fin, fasta in zip(annot_files_lst, fastas_lst):
sample = os.path.basename(fasta)
sample = os.path.splitext(sample)[0]
print fasta
go_dict, annot_dict = extract_GO_from_b2go(fin)
rev_annot = get_seq_names_by_annot(common_a, annot_dict)
seq_dict = fasta_2_dict(fasta)
# Print out in a file for each annotation
for annot in rev_annot:
fname = annot.replace(" ", "_")
fname = fname.replace("-", "_")
seq_n = rev_annot[annot]
with open(outfolder + fname, "a") as fout:
fout.write( ">" + seq_n + "_" + sample + "\n")
fout.write( seq_dict[seq_n] + "\n")
# print get_all_seq_names_by_annot(common_a, annot_files_lst)
#-------------------------------------------------------------------------------
if __name__ == "__main__":
import sys
print "EDIT THE PROGRAM BEFORE LAUNCHING"
# sys.exit()
annots = [ "/home/tiennou/Desktop/transit/beetles/AP1.annot",
"/home/tiennou/Desktop/transit/beetles/BB1.annot",
"/home/tiennou/Desktop/transit/beetles/NC2.annot",
"/home/tiennou/Desktop/transit/beetles/OO.annot",
"/home/tiennou/Desktop/transit/beetles/thor.annot",
"/home/tiennou/Desktop/transit/beetles/TM2.annot"
]
fastas = [ "/home/tiennou/Desktop/transit/beetles/AP1_clus95.fas",
"/home/tiennou/Desktop/transit/beetles/BB1_clus95.fas",
"/home/tiennou/Desktop/transit/beetles/NC2_clus95.fas",
"/home/tiennou/Desktop/transit/beetles/OO_clus95.fas",
"/home/tiennou/Desktop/transit/beetles/Thor_clus95.fas",
"/home/tiennou/Desktop/transit/beetles/TM2_clus95.fas"
]
outfolder = "test3/"
get_common_seqs(annots, fastas, outfolder)