-
Notifications
You must be signed in to change notification settings - Fork 25
/
concatenate_bed.py
executable file
·152 lines (124 loc) · 4.85 KB
/
concatenate_bed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python
import os
import argparse
import sys
import re
import gzip
# Command-line interface of the script.
# RawTextHelpFormatter prints the description and the help strings with the
# line breaks exactly as they are written here, so the text is kept unindented.
parser = argparse.ArgumentParser(
    description="""
DESCRIPTION:
Concatenate several bed files in a single one and adds a column identifying
for each line the source file.
An error is thrown if one tries to concatenate a file to itself.
This program concatenate any file, not just beds and no checking is done
about whether the files have bed formatting.
EXAMPLES:
Concatenate all bed files in current dir and strip the .bed extension
ls *.bed | concatenate_bed.py -i - -o concoutput.bed -s .bed
Output (last column is the file name):
--------------------------------------------------------------------------------
chr1 3521705 3521924 CpG:_27 2 52 219 0.2374429 bham359.coverage
chr1 3660700 3661155 CpG:_34 11 302 455 0.6637363 bham359.coverage
chr1 3661735 3662237 CpG:_45 10 311 502 0.6195219 bham359.coverage
...
--------------------------------------------------------------------------------
TODO:
""",
    formatter_class=argparse.RawTextHelpFormatter,
)

# One or more input files; the single value "-" means "read the list of file
# names from stdin" (handled by the code that consumes the parsed arguments).
parser.add_argument(
    '-i', '--input',
    nargs='+',
    type=str,
    required=True,
    help="""Input bed files(s) to be concatenated. Use -i - to
read the list of files from stdin (e.g. ls *.bed | concatenate_bed.py -i -)
""",
)

parser.add_argument(
    '-o', '--output',
    type=str,
    required=True,
    help="""Name of output file.
""",
)

# Raw string: the help text shows regexes containing backslashes, which are
# invalid escape sequences in a normal string literal.
parser.add_argument(
    '-s', '--strip',
    type=str,
    nargs='+',
    required=False,
    help=r"""List of regexs to strip from the input file name. Each of these
regexs will be removed. E.g.
-s \.merged\.bed$ \.bed$
will strip both .bed and .merged.bed. Note: Order matters, put more specific regex first (that is .merged.bed before .bed)
The resulting name will be used as identifier of the source file. Default is not
to strip anything.
""",
)

parser.add_argument(
    '-d', '--dir',
    action='store_true',
    required=False,
    help="""With this flag the file id in output will include
the directory path. Default is to strip the path.
""",
)

parser.add_argument(
    '--skip',
    type=int,
    default=0,
    required=False,
    help="""Skip this many lines from each input bed before
writing out (e.g. use --skip 1 to skip the header).
""",
)

# None (the default) disables filling; any string, including the empty
# string, enables it.
parser.add_argument(
    '--fill',
    type=str,
    default=None,
    required=False,
    help="""If the concatenated bed files have different number of fields,
fill short rows with this string (e.g. --fill NA). Note: The file name will remain
the last column. Default is None meaning don't fill up rows
""",
)

# Disabled option, kept for reference: an optional header line for the output.
#parser.add_argument('-H', '--header',
#    type= str,
#    required= False,
#    help="""Optional string to use as header.
#    """)
def get_file_id(path, keep_dir=False, strip=None):
    """Return the identifier written in the last output column for *path*.

    path: input file name as given by the user.
    keep_dir: when False (default) the directory part of *path* is dropped.
    strip: optional list of regexes; every match is removed from the name,
        applying the regexes one after the other in the order given.
    """
    file_id = path if keep_dir else os.path.split(path)[1]
    for regex in strip or []:
        file_id = re.sub(regex, '', file_id)
    return file_id


def open_text(path, mode='r'):
    """Open *path* in text mode, going through gzip when it ends with '.gz'.

    gzip.open() defaults to binary mode, which yields and expects bytes; text
    mode is requested explicitly so that plain and gzipped files are both
    handled as str by the callers.
    """
    if path.endswith('.gz'):
        return gzip.open(path, mode + 't')
    return open(path, mode)


def read_rows(path, file_id, skip=0):
    """Return the lines of *path* as lists of fields with *file_id* appended.

    The first *skip* lines are dropped. Each remaining line has its trailing
    newline characters removed and is split on tabs; no check is made that the
    content is actually bed formatted.
    """
    rows = []
    with open_text(path) as fin:
        for lineno, line in enumerate(fin):
            if lineno < skip:
                continue
            fields = line.rstrip('\n\r').split('\t')
            fields.append(file_id)
            rows.append(fields)
    return rows


def fill_rows(rows, fill):
    """Return *rows* with short rows padded with *fill* to a common length.

    Each row is a list of fields whose last item is the file ID. Padding is
    inserted just before that last item so the file ID stays the last column.
    """
    ncols = max((len(row) for row in rows), default=0)
    padded = []
    for row in rows:
        missing = ncols - len(row)
        if missing > 0:
            row = row[:-1] + [fill] * missing + [row[-1]]
        padded.append(row)
    return padded


def output_among_inputs(inputs, output):
    """Return True if *output* names the same file as one of *inputs*.

    Real paths are compared so that different spellings of the same file
    (e.g. 'out.bed' and './out.bed') are recognised.
    """
    out_real = os.path.realpath(output)
    return any(os.path.realpath(path) == out_real for path in inputs)


def concatenate(inputs, output, strip=None, keep_dir=False, skip=0, fill=None):
    """Concatenate *inputs* into *output*, adding the source file ID column.

    inputs: list of input file names; names ending in '.gz' are read as gzip.
    output: output file name; written as gzip if it ends in '.gz'.
    strip, keep_dir: see get_file_id().
    skip: number of leading lines to drop from every input file.
    fill: if not None, short rows are padded with this string (see
        fill_rows()).

    Returns the number of rows written.
    Raises ValueError if *output* is also one of *inputs*.
    """
    if output_among_inputs(inputs, output):
        raise ValueError('Output file %s also present in input list' % output)

    rows = []
    for path in inputs:
        file_id = get_file_id(path, keep_dir=keep_dir, strip=strip)
        print('Concatenating: %s; File ID: %s' % (path, file_id))
        rows.extend(read_rows(path, file_id, skip=skip))

    # Rows are lists of fields; padding needs the widest row, hence it can
    # only happen after every input has been read.
    if fill is not None:
        rows = fill_rows(rows, fill)

    # The output is opened only now, so a failure while reading the inputs
    # does not leave a truncated or empty output file behind.
    with open_text(output, 'w') as fout:
        fout.writelines('\t'.join(row) + '\n' for row in rows)
    return len(rows)


def main(argv=None):
    """Parse the command line (*argv*, default sys.argv[1:]) and run."""
    args = parser.parse_args(argv)

    inputs = args.input
    if inputs == ['-']:
        # File names come one per line from stdin; blank lines are ignored.
        inputs = [name.strip() for name in sys.stdin]
        inputs = [name for name in inputs if name]

    if output_among_inputs(inputs, args.output):
        sys.exit('%s error: Output file %s also present in input list'
                 % (os.path.basename(__file__), args.output))

    # Disabled, goes with the commented-out -H/--header option:
    #if args.header:
    #    print(args.header)

    concatenate(inputs, args.output, strip=args.strip, keep_dir=args.dir,
                skip=args.skip, fill=args.fill)


if __name__ == '__main__':
    main()