forked from xuchang116/smCounter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ds.reads.withinMT.py
114 lines (93 loc) · 3.41 KB
/
ds.reads.withinMT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/python
# vim: tabstop=9 expandtab shiftwidth=3 softtabstop=3
# Downsample reads, while maintaining MT count
# Chang Xu. 23NOV2015
import os
import sys
import datetime
import subprocess
import shutil
import time
import math
import pysam
from collections import defaultdict
import argparse
import random
#--------------------------------------------------------------------------------------
# main function
#--------------------------------------------------------------------------------------
def main(args):
random.seed(args.seed)
# change working directory to runDir and make output directories
os.chdir(args.runPath)
timeStart = datetime.datetime.now()
#print("started at " + str(timeStart))
bcDict = defaultdict(list)
samfile = pysam.AlignmentFile(args.inBam, "rb")
dsBam = pysam.AlignmentFile(args.outBam, "wb", template=samfile)
selectedReads = set()
droppedReads = set()
#print('reading in bam file')
for read in samfile.fetch():
# read ID
qname = read.query_name
# Barcode
BC = qname.strip().split(':')[-2]
if qname not in bcDict[BC]:
bcDict[BC].append(qname)
# Count the number of one-read MT and multi-read MT
#print('Counting MTs')
oneReadMtCnt = 0
multiReadMtCnt = 0
multiReadMtReadCnt = 0
for bc in bcDict.values():
if len(bc) == 1:
oneReadMtCnt += 1
else:
multiReadMtCnt += 1
multiReadMtReadCnt += len(bc)
# calculate probability to keep reads in multi-read MT, after keeping 1 read
probKeep = 1.0 * (args.rpb - 1.0) * (oneReadMtCnt + multiReadMtCnt) / (multiReadMtReadCnt - multiReadMtCnt)
# select or drop the reads
#print('Random sampling')
for bc in bcDict.values():
# always keep one read MT
if len(bc) == 1:
selectedReads.add(bc[0])
# keep the first read in multi-read MT, then random sample
else:
selectedReads.add(bc[0])
for rid in bc[1:]:
r = random.random()
if r <= probKeep:
selectedReads.add(rid)
# write to file
#print('Writing to BAM file')
for read in samfile.fetch():
# read ID
qname = read.query_name
if qname in selectedReads:
dsBam.write(read)
samfile.close()
dsBam.close()
# log run completion
timeEnd = datetime.datetime.now()
#print("completed running at " + str(timeEnd) + "\n")
#print("total time: "+ str(timeEnd-timeStart) + "\n")
#--------------------------------------------------------------------------------------
# Run
#--------------------------------------------------------------------------------------
def run():
parser = argparse.ArgumentParser(description='Downsample MTs')
parser.add_argument('--runPath', default=None, help='path to working directory')
parser.add_argument('--inBam', default=None, help='Input BAM file')
parser.add_argument('--outBam', default=None, help='Output BAM file')
parser.add_argument('--rpb', type=float, default=1.0, help='target reads per MT')
parser.add_argument('--seed', type=int, default=1234567, help='Seed for random number generation')
args = parser.parse_args()
main(args)
#----------------------------------------------------------------------------------------------
#pythonism to run from the command line
#----------------------------------------------------------------------------------------------
if __name__ == "__main__":
run()