forked from zzz2010/5UTR_Optimizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
finalFormat_3k_synthetic_seqs.py
52 lines (45 loc) · 1.41 KB
/
finalFormat_3k_synthetic_seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os,sys
import glob
import math
import operator

# Fixed sequences concatenated before / after the second column of every row.
SEQ_PREFIX = "TAAACTTAAGCTTGGTACCG"
SEQ_SUFFIX = "GCCACCATGGTGAGCAAGGG"

# Number of rows to aim for when no count is given on the command line.
DEFAULT_TOTAL = 3585

# A "best|<score>" row is kept only when its own score is at least this much
# above <score>.
MIN_GAIN = 0.05

# Substitutions applied, in this order, to a file's base name to build its
# model label (order matters: ".claudia_seq" must go before ".gaRibo").
LABEL_REWRITES = (
    ("gencode_v17_5utr_15bpcds.fa.", ""),
    (".claudia_seq", ""),
    (".gaRibo", "_Ribo"),
    (".galog", "_TE"),
    (".final", ""),
)


def model_label(path):
    """Return the model label encoded in the base name of *path*.

    The rewrites in LABEL_REWRITES are applied in order and everything from
    the first remaining "." onwards is dropped.  Examples of base names seen
    in the original comments:

        gencode_v17_5utr_15bpcds.fa.pc3.galog.62355               -> pc3_TE
        gencode_v17_5utr_15bpcds.fa.pc3.claudia_seq.gaRibo.13167  -> pc3_Ribo
    """
    name = os.path.basename(path)
    for old, new in LABEL_REWRITES:
        name = name.replace(old, new)
    return name.split(".")[0]


def read_candidates(paths):
    """Parse the given .sel files into output rows.

    Each non-blank line is split on whitespace into at least four fields:
    field 0 (emitted in the "generation" column), field 1 (the sequence that
    gets wrapped in SEQ_PREFIX/SEQ_SUFFIX), field 2 (score, or the literal
    "NA") and field 3 (info).

    Returns a tuple ``(direct_rows, per_model)``:

    * ``direct_rows`` -- list of formatted rows whose info does not start
      with "best"; these are always emitted.
    * ``per_model`` -- dict mapping model label to a dict of
      ``formatted row -> float score`` for rows whose info is
      "best|<score>" and whose own score is >= <score> + MIN_GAIN.

    A line with one to three fields still raises IndexError, as before;
    only completely blank lines are skipped.
    """
    # NOTE(review): the original file had lost its indentation; the nesting
    # below (non-"best" rows go to direct_rows) is the reading that makes the
    # control flow coherent -- confirm against the upstream script.
    seen = set()
    direct_rows = []
    per_model = {}
    for path in paths:
        label = model_label(path)
        # Context manager so each input file is closed as soon as it is read.
        with open(path) as handle:
            for line in handle:
                fields = line.strip().split()
                if not fields:
                    # A blank (e.g. trailing) line used to crash on fields[1].
                    continue
                seq = SEQ_PREFIX + fields[1] + SEQ_SUFFIX
                if seq in seen:
                    continue
                # Recorded before the "NA"/threshold checks on purpose: the
                # first occurrence of a sequence decides its fate, even when
                # that occurrence is itself discarded.
                seen.add(seq)
                score = fields[2]
                if score == "NA":
                    continue
                info = fields[3]
                row = "\t".join((seq, score, label, fields[0], info))
                if not info.startswith("best"):
                    direct_rows.append(row)
                    continue
                baseline = float(info.split("|")[1])
                if float(score) < baseline + MIN_GAIN:
                    continue
                per_model.setdefault(label, {})[row] = float(score)
    return direct_rows, per_model


def select_top_per_model(per_model, remaining):
    """Pick the highest-scoring rows of every model to fill *remaining* slots.

    Every model gets the same quota, ceil(remaining / number_of_models), and
    contributes its rows in descending score order up to that quota.  Returns
    a flat list of rows, grouped by model in the iteration order of
    *per_model*.  Returns an empty list when there are no models (previously
    a ZeroDivisionError) or when *remaining* is not positive.
    """
    if not per_model:
        return []
    quota = int(math.ceil(float(remaining) / len(per_model)))
    # Clamp: a negative quota must select nothing, not slice from the end.
    quota = max(quota, 0)
    chosen = []
    for rows in per_model.values():
        ranked = sorted(rows.items(), key=operator.itemgetter(1), reverse=True)
        chosen.extend(row for row, _ in ranked[:quota])
    return chosen


def main(argv=None):
    """Command-line entry point: ``script INPUT_DIR [NTOTAL]``.

    Reads every ``*.sel`` file in INPUT_DIR and prints a header, then all
    rows that are not "best" candidates, then the top "best" candidates of
    each model so that roughly NTOTAL (default DEFAULT_TOTAL) rows are
    printed in total.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = os.path.basename(argv[0]) if argv else "finalFormat_3k_synthetic_seqs.py"
        sys.exit("usage: %s INPUT_DIR [NTOTAL]" % prog)
    input_dir = argv[1]  # e.g. ./output
    total = int(argv[2]) if len(argv) > 2 else DEFAULT_TOTAL

    # glob order depends on the file system; sort so that the first-come
    # de-duplication across files gives the same result on every run.
    paths = sorted(glob.glob(input_dir + "/*.sel"))

    # NOTE(review): the header is space-separated as seen here while the rows
    # are tab-separated -- confirm whether literal tabs were lost upstream.
    print("seq score model generation info")
    direct_rows, per_model = read_candidates(paths)
    for row in direct_rows:
        print(row)
    for row in select_top_per_model(per_model, total - len(direct_rows)):
        print(row)


if __name__ == "__main__":
    main()