-
Notifications
You must be signed in to change notification settings - Fork 0
/
basics_fasta.py
executable file
·119 lines (98 loc) · 4.09 KB
/
basics_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import basics_nuc_seq as bns
import biographs as biog
import numpy
import random
#-------------------------------------------------------------------------------
def fasta_2_dict(fas_file, simple_ID=True):
"""
Get a dictionnary from fasta file with:
key = seq_id
value = seq
If simple_ID is True, then the seq_id in the dictionnary will be
the original one truncated after the first white space.
WORKING
"""
with open(fas_file, "r") as f:
seq_d = {}
# split each time a ">" in encountered
fasta = f.read().split('>')[1:]
# split the strings in 3 items tuple: seq id, the sep, and the seq
tmp = [ seq.partition("\n") for seq in fasta ]
# build a dictionnary with key=seq id, value=seq
for i in range( len( tmp ) ):
if simple_ID:
# for ex: for trinity output, split()[0] removes the len, path infos
seq_d[ tmp[i][0].split()[0] ] = tmp[i][2].replace( "\n", "").upper()
else:
seq_d[ tmp[i][0] ] = tmp[i][2].replace( "\n", "").upper()
# Check if no empty sequences
empty_seqs = [k for k in seq_d if len(seq_d[k]) == 0]
if empty_seqs:
raise IOError("Something seems wrong with this sequence:\n"
+ "\n".join(empty_seqs))
return seq_d
#-------------------------------------------------------------------------------
def filter_by_id(seq_d, seq_id_lst, startonly=False):
"""
Yield tuples (seq_id, seq) from a LIST of selected ids
"""
if not startonly:
for seq_id in seq_id_lst:
yield (seq_id, seq_d[seq_id])
else:
for seq_id in seq_id_lst:
for dictID in seq_d:
if dictID.startswith(seq_id):
yield (dictID, seq_d[dictID])
#-------------------------------------------------------------------------------
def filter_by_len(seq_d, len_thres):
"""
Yield tuples (seq_id,seq) of sequences with length >= len_thres
"""
for seq_id in seq_d:
if len(seq_d[ seq_id ]) >= int(len_thres):
yield (seq_id, seq_d[seq_id])
#-------------------------------------------------------------------------------
def get_random_seqs(seq_d, nseq):
"""
Return an iterator of tuples for 'nseq' randomly selected sequences
"""
seqids_l = random.sample( seq_d.keys(), nseq)
return filter_by_id(seq_d, seqids_l)
#-------------------------------------------------------------------------------
def write_as_fas(seq_i):
"""
Write to stdout a fasta file with the selected seqs from the seq_ids LIST.
Each seq line got a maximum of 80 characters
"""
for seq_id, seq in seq_i:
print(">" + seq_id)
# To cut seq lines each 80 characters
while len(seq) > 80:
print(seq[:80])
seq = seq[80:]
print(seq)
#-------------------------------------------------------------------------------
def make_summary(seq_d, graphs_path=None):
"""
Print out a summary of nucleotide sequences statistics
"""
GCs_d = bns.get_all_GCs(seq_d)
lens_d = bns.get_all_lens(seq_d)
n_seq = len(seq_d)
N50 = bns.get_N50(seq_d)
# Stdout:
print("Total seqs:\t{0}".format(n_seq))
print("Total length (in nucl):\t{0}".format(sum(lens_d.values())))
print("Mean GC:\t{0:.2f}".format(sum(GCs_d.values()) / float(n_seq)))
print("Mean length:\t{0:.2f}".format(sum(lens_d.values()) / float(n_seq)))
print("Standard deviation:\t{0:.2f}".format(numpy.std(lens_d.values())))
print("Median length:\t{0}".format(numpy.median(lens_d.values())))
print("Min length:\t{0}".format(min(lens_d.values())))
print("Max length:\t{0}".format(max(lens_d.values())))
print("N50:\t{0}".format(N50))
print("Contigs in N50:\t{0}".format(len([x for x in lens_d.values() if x >= N50 ])))
# Graphs:
if graphs_path:
biog.plot_hist(lens_d, graphs_path + "Contig_lengths_histo.png" )
#-------------------------------------------------------------------------------