-
Notifications
You must be signed in to change notification settings - Fork 0
/
clusters_from_bam.py
executable file
·189 lines (139 loc) · 5.96 KB
/
clusters_from_bam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import sys
import os
import pysam
import numpy as np
import subprocess
from collections import Counter
from itertools import chain
import matplotlib.pyplot as plt
from mylog import get_logger
logger = get_logger(__file__, __name__)
# ROOM FOR IMPROVEMENT...
# Sort isoform by abundance and named them iso_1 ... iso_n according
# to abundance
# Display name of the iso in the bam/fas file
# Force the usage of a set of exons by specifying a bed file in cmd line
def dedup_fasta(fasta, fasta_out=None):
"""
From a fasta file return a dictionary with the duplicate sequences
merged under a single entry (the name of the sequence will be the
concatenation of all the previous sequence names).
WARNING: Output dictionnary is as:
keys=sequences
values=sequence names
Produce a fasta outfile if specified
"""
dedup_dict = {}
for seq in pysam.FastxFile(fasta):
if seq.sequence in dedup_dict:
dedup_dict[seq.sequence].append(seq.name)
else:
dedup_dict[seq.sequence] = [seq.name]
reads_per_seq = Counter(map(lambda x: len(x), dedup_dict.values()))
logger.info('Number of copies per reads:')
logger.info(reads_per_seq)
# Plot log(x) + 1 to see count =1 and large values
plt.bar(reads_per_seq.keys(), np.log(reads_per_seq.values()) + 1)
plt.xlabel("Number of copies")
plt.ylabel("Log(Frequencies) + 1")
plt.savefig("Number_of_copies_per_unique_read.png")
plt.close()
# Create a fasta file with only unique sequences
if fasta_out:
with open(fasta_out, 'w') as f:
for seq in dedup_dict:
f.write('>' + '-'.join(dedup_dict[seq]) + '\n')
f.write(seq + '\n')
return dedup_dict
def get_putative_exons(bam):
"""
Get a bed file with putative exons from a bam file
"""
# Index the bam file
pysam.index(bam)
cmd = 'bedtools bamtobed -split -i {0} | bedtools sort -i - | bedtools merge -i - > {1}'.format(bam, config_potential_exon)
subprocess.call(cmd, shell=True)
def get_overall_exon_counts(bam):
"""
Get the total number of reads which are supporting each exons
"""
cmd = 'bedtools multicov -split -bams {0} -bed {1}'.format(bam, config_potential_exon)
out = subprocess.check_output(cmd, shell=True)
out = out.strip().split('\n')
out = [x.split('\t') for x in out]
logger.info("Number of exons: {}".format(len(out)))
logger.info("Exon coverage distribution: {}".format(out))
# xs = map(int, chain.from_iterable([i[1:3] for i in out]))
# ys = map(int, chain.from_iterable([[i[3], i[3]] for i in out]))
# plt.plot(xs, ys)
# plt.show()
def get_exon_count_per_read(bam):
"""
From a bam file with all reads aligned to the genome, generate one
bam file per read and counts per exons This might be unecessary
(this is done to produce the count matrix for clustering)
"""
sam = pysam.AlignmentFile(bam, 'rb')
fcount = 0
count_dict = {}
for rd in sam.fetch(until_eof=True):
bam_path = os.path.join(config_bam_out, str(fcount) + '.bam')
with pysam.AlignmentFile(bam_path, 'wb', template=sam) as bam_out:
bam_out.write(rd)
fcount += 1
logger.debug("Indexing {}".format(bam))
pysam.index(bam_path)
cmd = 'bedtools multicov -split -bams {0} -bed {1}'.format(bam_path, config_potential_exon)
out = subprocess.check_output(cmd, shell=True).strip('\n')
if rd in count_dict:
raise IOError('Duplicate read names detected')
count_dict[rd.query_name] = ''.join([x.split('\t')[3]
for x in out.strip().split('\n')])
with open(config_count_out, 'a') as f:
f.write(rd.query_name + '\t')
f.write('\t'.join(list(count_dict[rd.query_name])) + '\n')
logger.info('Total sequences put in single indexed bams: {}'.format(fcount))
return count_dict
def cluster_by_exon_composition(bam, count_dict):
"""
From a bam file with all reads aligned to the genome and a
dictionnary (k: read name, v: exon composition), get bams grouped
by exon composition
"""
exon_compo = set(count_dict.values())
for ex in exon_compo:
seq_in_clust = [k for k in count_dict if count_dict[k] == ex]
logger.info('For cluster {0}: {1} sequence(s)'.format(ex, len(seq_in_clust)))
bam_path = os.path.join(config_bam_clusters, 'cluster' + ex + '.bam')
fas_path = os.path.join(config_fas_clusters, 'cluster' + ex + '.fas')
sam = pysam.AlignmentFile(bam, 'rb')
with pysam.AlignmentFile(bam_path, 'wb', template=sam) as bam_out:
for rd in sam.fetch(until_eof=True):
if rd.query_name in seq_in_clust:
bam_out.write(rd)
with open(fas_path, 'a') as fas:
fas.write('>' + rd.query_name + '\n')
fas.write(rd.seq + '\n')
pysam.index(bam_path)
def prepare_outfolder():
logger.info('Creating output folders in {}'.format(config_outfolder))
os.makedirs(config_outfolder)
os.makedirs(config_bam_clusters)
os.makedirs(config_fas_clusters)
os.makedirs(config_bam_out)
if __name__ == '__main__':
# Deduplicate a fasta file
# dedup_fasta(sys.argv[1], sys.argv[2])
config_bam = sys.argv[1]
config_outfolder = sys.argv[2]
# Path for outfile potential exons
config_potential_exon = os.path.join(config_outfolder, 'potential_exons.bed')
config_bam_out = os.path.join(config_outfolder, 'bam_tmp')
config_bam_clusters = os.path.join(config_outfolder, 'bam_clusters')
config_fas_clusters = os.path.join(config_outfolder, 'fas_clusters')
config_count_out = os.path.join(config_outfolder, 'counts')
prepare_outfolder()
get_putative_exons(config_bam)
get_overall_exon_counts(config_bam)
count_dict = get_exon_count_per_read(config_bam)
cluster_by_exon_composition(config_bam, count_dict)