-
Notifications
You must be signed in to change notification settings - Fork 55
/
oliver.py
475 lines (430 loc) · 18.9 KB
/
oliver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
"""
Methods for identifying invariant genes (suitable reference/
normalization genes) from expression data
Date created: 16th April 2013
@see: Chan, OYW, Keng, BMH, Ling, MHT. 2014. Correlation and Variation
Based Method for Reference Genes Identification from Large Datasets.
Electronic Physician 6(1): 719-727.
License: GNU General Public License version 3 for academic or
not-for-profit use only
Licence: GNU General Public License version 3
Bactome package is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sys
import time
import math
import sets
from invariant_gene import selfed_correlation
from invariant_gene import selfed_ratio_correlation
from invariant_gene import selfed_product_correlation
from invariant_gene import regression_ratio
from invariant_gene import average_stdev
from invariant_gene import cv
from invariant_gene import gradient
def datafile(filename):
'''
Reads input data file (containing probe and expression data) into a
dictionary.
Input data file is a comma-delimited file in the format of
ProbeName1,Sample1Value,Sample2value,Sample3Value, ...
ProbeName2,Sample1Value,Sample2value,Sample3Value, ...
ProbeName3,Sample1Value,Sample2value,Sample3Value, ...
@param filename: input data file name
@return: dictionary where key is ProbeName and value is a list of
SampleValues
'''
print 'Reading data from', filename
t = time.time()
f = open(filename, 'r').readlines()
if len(f) == 1:
f = f[0].split('\r')
f = [x[:-1] for x in f]
f = [x.split(',') for x in f]
data = {}
for x in f:
try:
x = [y for y in x if y != '']
if len(x) > 2:
name = str(x[0])
values = [float(y) for y in x[1:]]
data[name] = values
except: 'Error formatting:', x
print ' Time taken:', time.time()-t, 'seconds'
return data
def p_gradient(data, options, methods, results):
'''
Processor for reference gene identification method: gradient
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating gradients ...'
t = time.time()
t_results = gradient(data)
methods.append('gradient')
for genename in t_results:
if results.has_key(genename):
results[genename]['gradient'] = t_results[genename]
else:
results[genename] = {'gradient': t_results[genename]}
print ' Time taken:', time.time()-t, 'seconds'
return (methods, results)
def p_cv(data, options, methods, results):
'''
Processor for reference gene identification method: coefficient of
variation
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating coefficient of variations ...'
t = time.time()
t_results = cv(data)
methods.append('cv')
for genename in t_results:
if results.has_key(genename):
results[genename]['cv'] = t_results[genename]
else:
results[genename] = {'cv': t_results[genename]}
print ' Time taken:', time.time()-t, 'seconds'
return (methods, results)
def p_regression_ratio(data, options, methods, results):
'''
Processor for reference gene identification method: ratio of
coefficient of determination (R^2) and gradient
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating regression ratios (R^2/slope) ...'
t = time.time()
t_results = regression_ratio(data)
methods.append('R^2/slope')
for genename in t_results:
if results.has_key(genename):
results[genename]['R^2/slope'] = t_results[genename]
else:
results[genename] = {'R^2/slope': t_results[genename]}
print ' Time taken:', time.time()-t, 'seconds'
return (methods, results)
def p_average_stdev(data, options, methods, results):
'''
Processor for reference gene identification method: product of average
and standard deviation
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating average x stdev ...'
t = time.time()
t_results = average_stdev(data)
methods.append('avg*SD')
for genename in t_results:
if results.has_key(genename):
results[genename]['avg*SD'] = t_results[genename]
else:
results[genename] = {'avg*SD': t_results[genename]}
print ' Time taken:', time.time()-t, 'seconds'
return (methods, results)
def p_selfed_product_correlation(data, options, methods, results):
'''
Processor for reference gene identification method: average
absolute pairwise correlation between dataset X and the
product of X and sample of non-X data (n = randomsize)
where large value represents higher stability
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating selfed product correlation ...'
t = time.time()
if not options.has_key('rss'): options['rss'] = 30
t_results = selfed_product_correlation(data, options['rss'])
methods.append('prod_corr')
for genename in t_results:
if results.has_key(genename):
results[genename]['prod_corr'] = t_results[genename]
else:
results[genename] = {'prod_corr': t_results[genename]}
print ' Time taken:', time.time()-t, 'seconds'
return (methods, results)
def p_selfed_ratio_correlation(data, options, methods, results):
'''
Processor for reference gene identification method: average
absolute pairwise correlation between dataset X and the
quotient of X and sample of non-X data (n = randomsize)
where small value represents higher stability
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating selfed ratio correlation ...'
t = time.time()
if not options.has_key('rss'): options['rss'] = 30
t_results = selfed_ratio_correlation(data, options['rss'])
methods.append('ratio_corr')
for genename in t_results:
if results.has_key(genename):
results[genename]['ratio_corr'] = t_results[genename]
else:
results[genename] = {'ratio_corr': t_results[genename]}
print ' Time taken:', time.time()-t, 'seconds'
return (methods, results)
def p_selfed_correlation(data, options, methods, results):
'''
Processor for reference gene identification method: average
absolute pairwise correlation between dataset X and sample
non-X data (n = randomsize) where small value represents
higher stability
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating selfed correlation ...'
t = time.time()
if not options.has_key('rss'): options['rss'] = 30
t_results = selfed_correlation(data, options['rss'])
methods.append('self_corr')
for genename in t_results:
if results.has_key(genename):
results[genename]['self_corr'] = t_results[genename]
else:
results[genename] = {'self_corr': t_results[genename]}
print ' Time taken:', time.time()-t, 'seconds'
return (methods, results)
def p_geomean_expratio_cv(data, options, methods, results):
'''
Processor for reference gene identification method: geometric mean
of exponent of ratio correlation (e^ratio) and coefficient of variation
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating geometric mean of ratio correlation and CV ...'
(methods, results) = p_selfed_ratio_correlation(data, options,
methods, results)
(methods, results) = p_cv(data, options, methods, results)
methods.append('geomean_expratio_cv')
for genename in results.keys():
eratio = math.e ** results[genename]['ratio_corr']
cv = results[genename]['cv']
geomean = math.sqrt(eratio * cv)
if results.has_key(genename):
results[genename]['geomean_expratio_cv'] = geomean
else:
results[genename] = {'geomean_expratio_cv': geomean}
return (methods, results)
def p_avgexpratio_avgcv(data, options, methods, results):
'''
Processor for reference gene identification method:
(e^ratio_correlation / average of e^ratio_correlation) +
(cv / average of cv)
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@param methods: list of methods used (current method will be appended
for final result file)
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: tuple of (methods, results)
'''
print 'Calculating sum of averaged ratio correlation and averaged CV ...'
(methods, results) = p_selfed_ratio_correlation(data, options,
methods, results)
(methods, results) = p_cv(data, options, methods, results)
methods.append('avgexpratio_avgcv')
avgexpratio = [math.e ** results[genename]['ratio_corr']
for genename in results.keys()]
avgexpratio = sum(avgexpratio) / float(len(avgexpratio))
avgcv = [results[genename]['cv']
for genename in results.keys()]
avgcv = sum(avgcv) / float(len(avgcv))
for genename in results.keys():
eratio = math.e ** results[genename]['ratio_corr']
cv = results[genename]['cv']
r = (float(eratio) / avgexpratio) + (float(cv) / avgcv)
if results.has_key(genename):
results[genename]['avgexpratio_avgcv'] = r
else:
results[genename] = {'avgexpratio_avgcv': r}
return (methods, results)
def processor(data, options):
'''
Main processor loop to execute each required reference gene
identification method processors
@param data: dictionary of input data from datafile function
@param options: options for current method (please see user manual)
@return: tuple of (methods, results) where methods is list of methods
used, and results is a dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
'''
results = {}
methods = []
if options.has_key('all_methods') or options.has_key('gradient'):
# Method: gradient
(methods, results) = p_gradient(data, options, methods, results)
if options.has_key('all_methods') or options.has_key('cv'):
# Method: CV
(methods, results) = p_cv(data, options, methods, results)
if options.has_key('all_methods') or options.has_key('regression'):
# Method: regression ratios (R^2/slope)
(methods, results) = p_regression_ratio(data, options,
methods, results)
if options.has_key('all_methods') or options.has_key('avgstd'):
# Method: average x stdev
(methods, results) = p_average_stdev(data, options,
methods, results)
if options.has_key('all_methods') or options.has_key('pcorr'):
# Method: selfed product correlation
(methods, results) = p_selfed_product_correlation(data, options,
methods, results)
if options.has_key('all_methods') or options.has_key('rcorr'):
# Method: selfed ratio correlation
(methods, results) = p_selfed_ratio_correlation(data, options,
methods, results)
if options.has_key('all_methods') or options.has_key('scorr'):
# Method: selfed correlation
(methods, results) = p_selfed_correlation(data, options,
methods, results)
# Method: geomean(e^ratio, CV)
(methods, results) = p_geomean_expratio_cv(data, options,
methods, results)
# Method: (e^ratio/avg(e^ratio)) + (cv/avg(cv))
(methods, results) = p_avgexpratio_avgcv(data, options,
methods, results)
return (methods, results)
def results_writer(filename, methods, results):
'''
Function to write out analysis results.
@param filename: name of output (results) file
@param methods: list of methods used
@param results: results dictionary in the format of {<ProbeName>:
{<method>: <results from method>}}
@return: none
'''
out = open(filename, 'w')
out.write('ResultFile,' + ','.join(methods) + '\n')
for genename in results:
values = ','.join([str(results[genename][m])
for m in methods])
out.write(','.join([genename, values]) + '\n')
out.close()
def option_processor(argv):
'''
Processor for options from command line into a dictionary (for example,
-rss:30 --> {'rss': 30})
'''
options = {}
options['application'] = argv[0]
argv = [x[1:] for x in argv[1:] if x.startswith('-')]
for arg in argv:
arg = arg.split(':')
if len(arg) == 2:
options[arg[0]] = arg[1]
else:
options[arg[0]] = ''
return options
def print_usage():
'''
Prints usage statement
'''
print '''
OLIgonucleotide Variable Expression Ranker (OLIVER) 1.0:
A tool for identifying suitable reference (invariant) genes from
large transcriptomme datasets.
Date created: 16th April 2013
License: GNU General Public License version 3 for academic or not-
for-profit use only
Usage: python oliver.py <expression filename> <results filename> [options]
where
<expression filename> is a comma-delimited file in the
format of
ProbeName1,Sample1Value,Sample2value,Sample3Value, ...
ProbeName2,Sample1Value,Sample2value,Sample3Value, ...
ProbeName3,Sample1Value,Sample2value,Sample3Value, ...
...
[options] in the format of -<option key>:<option value>
For example, -outfmt:5 (option with value) and -fmt
(option without value)
Allowed options (please refer to user manual for detailed description):
-all_methods Calculate for all available methods of reference genes
identification
-avgstd Calculates average x standard deviation
-cv Calculates coefficient of variation
-gradient Calculates gradient
-help Prints / displays usage message
-regression Calculates regression ratio (R^2/slope)
-pcorr Calculates selfed product correlation
-rcorr Calculates selfed ratio correlation
-rss Define the number of random samples for pair-wise
correlations. If not given, the default is 30.
-scorr Calculates selfed correlation
'''
if __name__=='__main__':
if len(sys.argv) < 3:
print print_usage()
else:
print '''
OLIgonucleotide Variable Expression Ranker (OLIVER) 1.0:
A tool for identifying suitable reference (invariant) genes from
large transcriptomme datasets.
Date created: 16th April 2013
License: GNU General Public License version 3 for academic or
not-for-profit use only
'''
options = option_processor(sys.argv)
if options.has_key('help'):
print print_usage()
sys.exit(0)
data = datafile(sys.argv[1])
(methods, results) = processor(data, options)
methods = list(sets.Set(methods))
results_writer(sys.argv[2], methods, results)
print
print 'Summary ......'
print 'Input data file:', sys.argv[1]
print 'Number of genes/probes in input data file:', len(data)
print 'Output results file:', sys.argv[2]
print 'Number of methods used:', len(methods)
print 'Methods:', ', '.join(methods)
print '===== end of summary ====='