forked from AnimaTardeb/G4Hunter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathg4_predictor.py
212 lines (191 loc) · 10.1 KB
/
g4_predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""
Guanine_quadruplex_predictor.py -i <inputFasta> -o <outputDir> -w Window_width -s Threshold_score
Author: J Sietsma Penington
Date: 21 April 2017
Implementation of G4Hunter algorithm from
Re-evaluation of G-quadruplex propensity with G4Hunter
Amina Bedrat Laurent Lacroix Jean-Louis Mergny
Nucleic Acids Res (2016) 44 (4): 1746-1759.
DOI: https://doi.org/10.1093/nar/gkw006
Input Fasta-format file of sequences is scanned for regions with high 'G-richness' +
'G-skewness', implemented as a score based on the number of consecutive G nucleotides.
To simultaneously assess the complementary strand the same score is calculated as a
negative number for consecutive C nucleotides. Skewness is implemented by adding the
positive G scores and negative C scores so that regions with balanced G and C numbers have
small scores.
Two output files are written, one containing all sliding windows with absolute value of
mean score above the given threshold, and one containing extended regions of merged
windows which pass the criterion.
Intervals are written in 0-based, half open format, (]. 'start' is the position
before the interval and 'end' is the last position in the interval.
For example the first nucleotide would be described:
start end length
0 1 1
Two decisions that may need changing:
1. A merged region of adjacent windows has the score for the region calculated, but
is not filtered by the final score. It is possible a number of windows with
scores above threshold may combine to a region with mean score below threshold, which
would still be written to the output file. This is same as Bedrat-Lacroix-Mergny
output, but may be undesirable.
2. Score is recalculated for each single-nucleotide advance of the window.
It would probably be more efficient to calculate the score for the first window, then
add and subtract scores for just the first and last nucleotides as the window is
advanced.
"""
import sys
import argparse
import os
from Bio import SeqIO
from Bio.Seq import Seq
def ScoreSeq(seq, winStart=0, winEnd=-1):
if winEnd < 0:
winEnd = len(seq)
# calculate the total score for a window in the supplied sequence.
# Default is window=sequence. If seq is larger, leading and trailing nucleotides are
# used to adjust the score for the window if they extend a run.
priorGcount = priorCcount = GbeforeWindow = CbeforeWindow = score = 0
if winStart > 0: # there are leading nucleotides. Increment length of run
i = winStart - 1
if seq[winStart] == "G":
while i >= 0 and seq[i] == "G" and GbeforeWindow < 4:
GbeforeWindow += 1
i -= 1
elif seq[winStart] == "C":
while i >= 0 and seq[i] == "C" and CbeforeWindow < 4:
CbeforeWindow += 1
i -= 1
# this corrects for the fact that in the main algorithm the prior counts are used to
# adjust score for whole run, not just nucleotide under consideration
for i in range(winStart, winEnd):
if seq[i] == "G":
if priorGcount + GbeforeWindow < 4:
score += priorGcount*2 + 1 + GbeforeWindow
priorGcount += 1
else: # if run reaches GGGG, each subsequent G adds 4 regardless of number
score += 4
priorCcount = 0
CbeforeWindow = 0 # any C run is terminated
elif seq[i] == "C":
if priorCcount + CbeforeWindow < 4:
score -= priorCcount*2 + 1 + CbeforeWindow
priorCcount += 1
else: # if run reaches CCCC, each subsequent C subtracts 4 regardless of number
score -= 4
priorGcount = 0
GbeforeWindow = 0 # any G run is terminated
else: # nucleotide is A, T or ambiguous: no change to score, any run is terminated
priorGcount = priorCcount = GbeforeWindow = CbeforeWindow = 0
if winEnd < len(seq): # there are trailing nucleotides. Look ahead to adjust score
i = winEnd
if seq[winEnd - 1] == "G":
runlength = priorGcount
while i < len(seq) and seq[i] == "G" and runlength < 4:
score += priorGcount
runlength += 1
i += 1
if seq[winEnd - 1] == "C":
runlength = priorCcount
while i < len(seq) and seq[i] == "C" and runlength < 4:
score -= priorCcount
runlength += 1
i += 1
meanScore = float(score) / (winEnd - winStart)
return meanScore
def G4predictor(inFasta, out_win_file, out_region_file, win_width, thresh):
regionCount = 0
seqRecordCount = 0
for seq_record in SeqIO.parse(inFasta, "fasta"):
seqRecordCount += 1
out_win_file.write(seq_record.id + '\n')
out_win_file.write("Start\tEnd\tSequence\tLength\tScore\n")
out_region_file.write(seq_record.id + '\n')
seq_curr = seq_record.upper()
prevWinPass = False
G4predCount = predG4start = predG4end = 0
predG4seq = ""
# Slide window along sequence, advance 1 nucleotide each loop:
for i in range(len(seq_curr) - win_width + 1):
win_curr = seq_curr[i : i + win_width]
# where possible calculate score using 3 leading and 3 trailing bases, to get full value for bases in runs
extended_win = seq_curr.seq[max(0, i-3) : min(i + win_width +3, len(seq_curr)) ]
score_curr = ScoreSeq(extended_win, min(3, i), min(win_width+3, len(extended_win)))
if abs(score_curr) > thresh :
out_win_file.write('{0}\t{1}\t{2}\t{3}\t{4:.2f}\n'.format(
i, i+win_width, win_curr.seq, win_width, score_curr))
if i > predG4end:
# start of new predicted G4 region : write any previous region to file
if G4predCount == 0:
out_region_file.write("Start\tEnd\tSequence\tLength\tScore\tNumber\n")
else:
out_region_file.write('{0}\t{1}\t{2}\t{3}\t{4:.2f}\t{5}\n'.format(
predG4start, predG4end, predG4seq,
len(predG4seq), ScoreSeq(predG4seq), G4predCount) )
G4predCount += 1
# Adjust start of new region:
if win_curr[0] in ["C", "G"]:
# extend backwards to prior Gs or Cs if they match
predG4seq = win_curr.seq
predG4start = i
while predG4start > 0 and predG4seq[0] == seq_curr.seq[predG4start-1]:
predG4seq = seq_curr.seq[predG4start-1] + predG4seq
predG4start -= 1
else: # strip off leading A and T nucleotides
predG4seq = win_curr.seq.lstrip("AT")
predG4start = i + win_width - len(predG4seq)
else: # current window extends current G4 region
if prevWinPass: # simple case of adjacent windows
predG4seq = predG4seq + win_curr.seq[-1]
else:
predG4seq = seq_curr[predG4start:i + win_width].seq
predG4end = i + win_width
prevWinPass = True
else:
if prevWinPass:
# a run of 1 or more windows with scores above threshold has just ended
if predG4seq[-1] in ["C", "G"]:
# extend forward to next base if it matches
if predG4seq[-1] == win_curr.seq[-1]:
predG4seq = predG4seq + win_curr.seq[-1]
else: # strip off trailing A and T nucleotides
predG4seq = predG4seq.rstrip("AT")
predG4end = predG4start + len(predG4seq)
prevWinPass = False
# Adjust end of, and write, last region
if G4predCount == 0:
out_region_file.write("# No predicted quadruplex regions found\n")
else:
while (predG4end < len(seq_curr)-1 and predG4seq[-1] in ["C", "G"] and
predG4seq[-1] == seq_curr.seq[predG4end+1]):
predG4seq = predG4seq + seq_curr.seq[predG4end+1]
predG4end += 1
predG4seq = predG4seq.rstrip("AT")
predG4end = predG4start + len(predG4seq)
out_region_file.write('{0}\t{1}\t{2}\t{3}\t{4:.2f}\t{5}\n'.format(
predG4start, predG4end, predG4seq, len(predG4seq),
ScoreSeq(predG4seq), G4predCount) )
regionCount += G4predCount
print "Number of sequences:", seqRecordCount
print "Number of potential quadruplex regions from given parameters:", regionCount
if __name__=="__main__":
parser = argparse.ArgumentParser(description=
'Search a FASTA file for regions likely to form G4 quadruplexes')
parser.add_argument('-i', dest='fasta_name', metavar='<infile>',
help='the input fasta file')
parser.add_argument('-o', dest='out_dir', metavar='<outdir>',
help='directory for output files')
parser.add_argument('-w', dest='window_width', type=int,
help= 'Width of sliding window. Recommended value 25 nts')
parser.add_argument('-s', dest='threshold', type=float,
help= 'Threshold for absolute value of score to predict a quadruplex region. Typical value 1.0 to 1.5')
args = parser.parse_args()
basefname = os.path.basename(args.fasta_name).split('.')[0]
outDirName = os.path.join(args.out_dir, "g4Results_" + str(args.threshold))
os.mkdir(outDirName)
outWinName = os.path.join(outDirName,
basefname + "_" + str(args.window_width) + "nts.tsv")
outputWindows = open(outWinName, 'w')
outRegionName = os.path.join(outDirName,
basefname + "_merged.tsv")
outputRegions = open(outRegionName, 'w')
print "Input file:", os.path.basename(args.fasta_name)
G4predictor(args.fasta_name, outputWindows, outputRegions, args.window_width, args.threshold)