-
Notifications
You must be signed in to change notification settings - Fork 55
/
gc.py
199 lines (170 loc) · 7.52 KB
/
gc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
'''
Percent GC Calculator
Date created: 20th April 2018
License: GNU General Public License version 3 for academic or
not-for-profit use only
Bactome package is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
import random
import fire
from bactome_utils import sequenceSelector
from genbank import recordSelector
def sliderGC(gbfile, RecordIndex=0, RecordID=None, start=0,
end=-1, window=10000, interval=1):
'''!
Function to calculate %GC of a specific sequence in Genbank
file using a sliding-window method.
Usage:
python gc.py slidegc --gbfile=<Genbank file path>
--RecordIndex=<record number> --RecordID=<Genbank record name>
--start=<starting base position> --end=<ending base position>
--window=<window size> --interval=<interval size>
where
- record number is the relative Genbank record number in
the given Genbank file where 0 is the first record and
1 is the secord record, and so on
- Genbank record name is the identifying ID for the Genbank
record
- starting and ending base positions define the boundary of
analysis (region of interest). If entire chromosome is of
interest, starting and ending base positions will be 0 and
-1 respectively
- window size is the window to calculate. For example, if
window size is 1000, %GC will be calculated as ratio of GC
per 1000 bases
- interval refers to be magnitude of slide
Either RecordIndex or RecordID is required.
@param gbfile string: file path of the Genbank file
@param RecordIndex Integer: Relative Genbank record number in
the given Genbank file where 0 is the first record and 1 is
the secord record, and so on
@param RecordID String: Unique name (identifying IDs) for the
specific Genbank record
'''
sequence = recordSelector(gbfile, RecordIndex, RecordID,
'sequence')
sequence = sequenceSelector(sequence, start, end)
window = int(window)
interval = int(interval)
pointer = 1
pdata = []
while (pointer - 1 + window) < len(sequence):
s = sequence[pointer-1:pointer-1+window]
gc = float(s.count('G') + s.count('C')) * 100/window
print('%i : %i : %.4f' % (pointer+start,
pointer-1+start+window,
gc))
pointer = pointer + interval
def blockGC(gbfile, RecordIndex=0, RecordID=None, start=0, end=-1,
window=10000):
'''!
Function to calculate %GC by blocks of a specific sequence in
Genbank file.
Usage:
python gc.py blockgc --gbfile=<Genbank file path>
--RecordIndex=<record number> --RecordID=<Genbank record name>
--start=<starting base position> --end=<ending base position>
--window=<window size>
where
- record number is the relative Genbank record number in
the given Genbank file where 0 is the first record and
1 is the secord record, and so on
- Genbank record name is the identifying ID for the Genbank
record
- starting and ending base positions define the boundary of
analysis (region of interest). If entire chromosome is of
interest, starting and ending base positions will be 0 and
-1 respectively
- window size is the window to calculate. For example, if
window size is 1000, %GC will be calculated as ratio of GC
per 1000 bases
Either RecordIndex or RecordID is required.
@param gbfile string: file path of the Genbank file
@param RecordIndex Integer: Relative Genbank record number in
the given Genbank file where 0 is the first record and 1 is
the secord record, and so on
@param RecordID String: Unique name (identifying IDs) for the
specific Genbank record
'''
sequence = recordSelector(gbfile, RecordIndex, RecordID,
'sequence')
sequence = sequenceSelector(sequence, start, end)
window = int(window)
pointer = 1
pdata = []
while (pointer - 1 + window) < len(sequence):
s = sequence[pointer-1:pointer-1+window]
gc = float(s.count('G') + s.count('C')) * 100/window
print('%i : %i : %.4f' % (pointer+start,
pointer-1+start+window,
gc))
pointer = pointer + window
def randomize(gbfile, RecordIndex=0, RecordID=None, start=0, end=-1,
window=10000, n=5000):
'''!
Function to generate random sequence samples from a specific
sequence in Genbank file and calculate the %GC of each randomized
sequence.
Usage:
python gc.py random --gbfile=<Genbank file path>
--RecordIndex=<record number> --RecordID=<Genbank record name>
--start=<starting base position> --end=<ending base position>
--window=<window size> --n=<sample size>
where
- record number is the relative Genbank record number in
the given Genbank file where 0 is the first record and
1 is the secord record, and so on
- Genbank record name is the identifying ID for the Genbank
record
- starting and ending base positions define the boundary of
analysis (region of interest). If entire chromosome is of
interest, starting and ending base positions will be 0 and
-1 respectively
- window is the window size to calculate. For example, if
window size is 1000, %GC will be calculated as ratio of GC
per 1000 bases
- sample size is the number of randomized samples to generate
Either RecordIndex or RecordID is required.
@param gbfile string: file path of the Genbank file
@param RecordIndex Integer: Relative Genbank record number in
the given Genbank file where 0 is the first record and 1 is
the secord record, and so on
@param RecordID String: Unique name (identifying IDs) for the
specific Genbank record
'''
sequence = recordSelector(gbfile, RecordIndex, RecordID,
'sequence')
sequence = sequenceSelector(sequence, start, end)
choices = list(range(len(sequence)))
window = int(window)
count = 0
data = []
pdata = []
while count < int(n):
index = [random.choice(choices) for i in range(window)]
s = [sequence[i] for i in index]
s = ''.join(s)
gc = float(s.count('G') + s.count('C')) * 100/window
data.append(gc)
print('%i : %.4f' % (count+1, gc))
count = count + 1
print()
mean_gc = sum(data) / len(data)
stdev_gc = (sum([(i-mean_gc)**2 for i in data]) / \
(len(data) - 1)) ** 0.5
print('Average Percent GC: %.4f' % mean_gc)
print('Standard Deviation Percent GC: %.5f' % stdev_gc)
if __name__ == '__main__':
exposed_functions = {'slidegc': sliderGC,
'blockgc': blockGC,
'random': randomize}
fire.Fire(exposed_functions)