'''
Configuration file for RedDog.py V1beta.10.4
-------------------------------
Copyright (c) 2016 David Edwards, Bernie Pope, Kat Holt
All rights reserved. (see README.txt for more details)
Essential pipeline variables.
'''
no_check = True
reference = ""
sequences = ""
output = ""
out_merge_target = ""
'''
force_tree and force_no_tree
The pipeline can produce a FastTree. If there are more than 500 isolates, tree generation
will be switched off. If you want a tree for larger data sets, set force_tree to 'True'.
If you want to turn off the tree entirely, set force_no_tree to 'True'
(this overrides force_tree).
'''
force_tree = False
#force_tree = True
force_no_tree = False
#force_no_tree = True
'''
Notes:
'no_check' is for switching off the user check at the start of the run.
This enables the pipeline to be run as a single job (with lots of CPUs) on a distributed system.
IMPORTANT: DO NOT CHANGE SETTINGS FOR A 'MERGE' RUN: RedDog still checks the previous settings
against those in the config file and will ask for confirmation of changes.
If you are unsure of your previous settings, check the 'Run Report'.
'reference' and 'sequences'
Reference can be GenBank or fasta format: if GenBank format, this will be converted
to a fasta version for mapping.
If the GenBank file is not given, the gene cover and depth matrices
will not be generated, nor will the SNP consequences.
If you don't have the GenBank record, or don't want the above matrices
to be generated, enter a fasta format reference instead.
Ambiguous calls in the reference will cause problems for your output:
check that the sequence uses only A, C, G, T or N (upper or lower case).
RedDog will not run if it finds ambiguous calls (v1b11).
'''
#Test Sets
#reference = "/full_path_to/pipeline_test_sets/reference/NC_007384.gbk"
#reference = "/full_path_to/pipeline_test_sets/reference/NC_007384_with_plasmid.gbk"
#reference = "/full_path_to/pipeline_test_sets/reference/NC_007384_with_plasmid.fasta"
#sequences = "/full_path_to/pipeline_test_sets/*.fastq.gz"
#sequences = "/full_path_to/pipeline_test_sets/extra/*.fastq.gz"
# You can now also combine sequences from different folders into the same run...
#sequences = ["/full_path_to/pipeline_test_sets/*.fastq.gz", "/full_path_to/pipeline_test_sets/extra/*.fastq.gz"]
'''
'output' directory:
full path name including final "/"
For large data sets, write the output to the scratch disk area and save the final output to
your shared directory (or contagion if you have access)
e.g. output = "/scratch/VR0082/a_folder/<ref>_<version>_<date>/"
'''
#output = "/full_path_to/<your_directory>/RedDog_output/<ref>_<version>_<date>/"
'''
Directory to merge output with ('out_merge_target'):
Set to the null string for a new analysis (no merging).
Otherwise, set it to the directory of the prior run you want to merge with.
You can only merge a prior run with a new run (not two prior runs).
The merge target folder must have the bams and indexes in one
sub-folder (/bam) and the vcfs in another (/vcf); there must also
be a sequence_list.txt file.
The 'output' folder (see above) for a merge run should NOT exist prior to the run,
and will be deleted at completion of the pipeline.
'''
#out_merge_target = ""
#out_merge_target = "/full_path_to/<your_directory>/RedDog_output/<ref>_<version>_<date>/"
'''
If none of the following are set or changed, the default settings will be used.
###############################################################################
Choose the type of input sequences: IT for Ion Torrent (single reads),
PE for Illumina paired-end reads,
or SE for Illumina single-end reads.
readType = "PE" or "SE" or "IT"
Each read type has a particular pattern you need to follow for use in the pipeline:
SE: *.fastq.gz (i.e. must be gzipped)
PE: *_1.fastq.gz and *_2.fastq.gz (i.e. must be gzipped with forward and reverse in separate files)
IT: *_in.iontor.fastq.gz (i.e. must be gzipped)
'''
readType = "PE"
#readType = "SE"
#readType = "IT"
'''
Run Type: if set to a null string, the number of replicons in the reference will determine
the run type:
1 - 100 replicons (e.g. reference genome + plasmids + phage) - phylogeny run type
> 100 replicons (e.g. multifasta pangenome) - pangenome run type
The user can override the run type by setting it below. The run types are described in
more detail in the instructions.
'''
runType = ""
#runType = "pangenome"
#runType = "phylogeny"
'''
For a pangenome run, the SNPs will only be called for the largest replicon: this assumes
the core genome is in this replicon. The user can define an alternative replicon
(or replicons) for the SNP calling.
Note: there must be a space after any comma.
Set to the null string to use the largest replicon.
'''
core_replicon = ""
#core_replicon = "NC_007384, NC_007385"
#core_replicon = "AM412236_4_168118-212711"
#core_replicon = "AM412236_4_168118-212711, ParatyphiA_AKU1"
'''
Mapping To Use:
For Illumina reads, both BWA sampe/samse and Bowtie2 are available.
If you don't set the mapping, the default is Bowtie2
(for either paired-end or single reads).
For Ion Torrent reads, only Bowtie2 is available.
              PE   SE   IT
BWA sampe     Y    N    N
BWA samse     N    Y    N
Bowtie2       Y    Y    Y
(PE/SE: Illumina; IT: Ion Torrent)
Note: You cannot use two mappers at the same time!
'''
#mapping = "bwa"
mapping = "bowtie"
'''
Bowtie2 Mapping:
You can change the preset mapping options used with Bowtie2
with the 'bowtie_map_type' option.
Preset options in --end-to-end mode
--very-fast
--fast
--sensitive
--very-sensitive
Preset options in --local mode
--very-fast-local
--fast-local
--sensitive-local
--very-sensitive-local
default: if bowtie_map_type is set to "",
sensitive-local mapping will be used.
See the Bowtie2 Manual for more information on the presets
http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
'''
#end-to-end mode
#bowtie_map_type = "--very-fast"
#bowtie_map_type = "--fast"
#bowtie_map_type = "--sensitive"
#bowtie_map_type = "--very-sensitive"
#local mode
#bowtie_map_type = "--very-fast-local"
#bowtie_map_type = "--fast-local"
bowtie_map_type = "--sensitive-local"
#bowtie_map_type = "--very-sensitive-local"
'''
The default maximum fragment length for bowtie2 to consider paired-end reads concordant
is 2000: you can change this with bowtie_X_value.
'''
bowtie_X_value = 2000
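# (-X here is bowtie2's -X/--maxins option: the maximum fragment length
# for a read pair to count as a valid, concordant alignment; it is
# passed to the alignBowtiePE stage below as -X %Xvalue.)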
'''
BCFtools SNP calling
consensus caller ["c"] (original) or multiallelic caller ["m"] (new bcftools v1+)
Note: multiallelic caller is used only for calling unique SNPs from the BAM files.
The consensus sequences used to populate the allele table based on these SNPs are
still generated using the original consensus caller: this will be changed if/when
a vcf2fq program is available for multiallelic-generated VCFs
'''
SNPcaller = "c"
#SNPcaller = "m"
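# (For reference: "c" and "m" select between bcftools call's -c
# (consensus) and -m (multiallelic) callers; the choice is passed
# through to the callRepSNPs stage below as %option.)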
'''
You can also "remove" any read sets: these will be marked as "failed".
This only works during a "merge" run,
e.g. to replace a set of reads with their QC-ed version:
replaceReads = "read_set_2,read_set_3,read_set_24"
'''
replaceReads = ""
#replaceReads = "pool8_tag1"
#replaceReads = "pool1_tag1,pool1_tag3"
'''
Minimum depth of reads for variant filtering
Default value:
minimum_depth = 5
'''
minimum_depth = 5
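# (Applied as the -d argument to vcfutils.pl varFilter in the
# q30VarFilter stages below.)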
'''
HetsVCF
The pipeline filters out heterozygous SNP calls.
To capture these SNPs in the form of a VCF (one per isolate),
set the following to 'True' (now the default behaviour).
'''
#HetsVCF = False
HetsVCF = True
'''
Values for calling the pass/fail and ingroup/outgroup status of strains
suggested (default) values
cover_fail = 50
depth_fail = 10
mapped_fail = 50
sd_out = 2
'''
cover_fail = 50
depth_fail = 10
mapped_fail = 50
sd_out = 2
'''
Strand Bias Cutoff value
i.e. if ABS(DP4[2]-DP4[3])/(DP4[2]+DP4[3]) < strand_bias_cutoff, the SNP is included.
Set to a value >1 to turn strand bias filtering off.
'''
strand_bias_cutoff = 0.8
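# Worked illustration of the cutoff above (not RedDog's code): DP4
# holds the ref-forward, ref-reverse, alt-forward and alt-reverse read
# counts, so the filter looks only at the alt counts DP4[2] and DP4[3].
# The counts below are made up; uncomment to try:
#def passes_strand_bias(dp4, cutoff):
#    alt_forward, alt_reverse = dp4[2], dp4[3]
#    bias = abs(alt_forward - alt_reverse) / float(alt_forward + alt_reverse)
#    return bias < cutoff
#print(passes_strand_bias([10, 12, 9, 11], strand_bias_cutoff))  # bias 0.1 -> True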
'''
To switch on or off the checking of percentage of reads mapped
use the following 'check_reads_mapped'.
By default (i.e. set to "") the pipeline will use the largest replicon.
If set to "off", there will be no check for percentage of reads mapped.
Otherwise, give a list of the n replicons to be checked, followed by an 'x',
followed by the ratios of the first n-1 replicons.
For a single replicon, just give the replicon.
e.g. check_reads_mapped = "rep_1"
or check_reads_mapped = "rep_1,rep_2,rep_3,x,0.45,0.3"
i.e. rep1 is 45% of the total genome, rep2 is 30% of the total genome,
and rep3 is 25% of the total genome (by default).
Note: there must be no spaces in the list.
'''
check_reads_mapped = ""
#check_reads_mapped = "off"
#check_reads_mapped = "rep_1"
#check_reads_mapped = "rep_1,rep_2,x,0.6"
#check_reads_mapped = "rep_1,rep_2,rep_3,x,0.45,0.3"
#check_reads_mapped = "CP002555,CP002556,x,0.75"
'''
During allele matrix filtering, you can set the conservation level for missing alleles
this is a ratio between 1.0 (100% conservation: remove all SNPs with even one missing allele call)
and 0.0 (0% conservation: remove no SNPs). By default, the pipeline produces the 95% and
0% conservation matrices, with downstream analysis on the 95% matrix.
By entering a different conservation level (e.g. 0.85), both the 95% and 0% matrices
will still be produced, but so too will the 85% matrix (in this example),
with downstream analysis carried out on this matrix.
'''
conservation = 0.95
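# Worked example of the ratio (illustration only, with a hypothetical
# isolate count): with 200 isolates and conservation = 0.95, a SNP is
# kept only if at least ceil(0.95 * 200) = 190 isolates have an allele
# call, i.e. at most 10 missing calls. Uncomment to try:
#import math
#isolates = 200  # hypothetical number of isolates in the run
#calls_needed = int(math.ceil(conservation * isolates))  # -> 190
#max_missing = isolates - calls_needed                   # -> 10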
'''
Difference Matrix
The pipeline can produce a difference matrix. Currently this is a pairwise difference count.
To get the difference matrix, set the following to 'True'.
'''
DifferenceMatrix = False
#DifferenceMatrix = True
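# Illustration of a pairwise difference count (an assumption about the
# definition, not make_distance_matrix.py itself): for each pair of
# isolates, count the positions where both have a call and the calls
# differ. Uncomment to try:
#def pairwise_differences(alleles_a, alleles_b, missing="-"):
#    return sum(1 for a, b in zip(alleles_a, alleles_b)
#               if a != missing and b != missing and a != b)
#print(pairwise_differences("ACGTA", "ACTTA"))  # -> 1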
'''
########################
Rubra pipeline variables (do not delete!):
########################
- logDir: the directory where batch queue scripts, stdout and stderr dumps are stored.
- logFile: the file used to log all jobs that are run.
- style: the default style, one of 'flowchart', 'print', 'run', 'touchfiles'. Can be
overridden by specifying --style on the command line.
- procs: the number of python processes to run simultaneously. This determines the
maximum parallelism of the pipeline. For distributed jobs it also constrains the
maximum total jobs submitted to the queue at any one time.
- verbosity: one of 0 (quiet), 1 (normal), 2 (chatty). Can be overridden by specifying
--verbose on the command line.
- end: the desired tasks to be run. Rubra will also run all tasks which are dependencies
of these tasks. Can be overridden by specifying --end on the command line.
- force: tasks which will be forced to run, regardless of timestamps. Can be overridden
by supplying --force on the command line.
- rebuild: one of 'fromstart','fromend'. Whether to calculate which dependencies will
be rerun by working back from an end task to the latest up-to-date task, or forward
from the earliest out-of-date task. 'fromstart' is the most conservative and
commonly used as it brings all intermediate tasks up to date.
'''
pipeline = {
"logDir": "log",
"logFile": "pipeline.log",
"style": "print",
"procs": 20,
"paired": True,
"verbose": 1,
"end": ["deleteDir"],
"force": [],
"rebuild" : "fromstart"
}
stageDefaults = {
"distributed": False,
"walltime": "01:00:00",
"memInGB": 4,
"queue": "normal",
"modules": [
"python/2.7.15-gcc5",
"bwa/0.7.17-gcc5",
"samtools/1.9-gcc5",
"bcftools/1.8",
"ea-utils/1.1.2-gcc5",
"fasttree/2.1.10",
"bowtie2/2.2.9"
]
}
stages = {
"makeDir": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python makeDir.py %out %sequence_list"
},
"copyRef": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "cp %ref %newRef"
},
"makeRef": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python convertGenbankToFasta.py %gen %ref"
},
"buildBowtieIndex": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "bowtie2-build %ref %base"
},
"alignBowtiePE": {
"walltime": "03:00:00",
# large file size (any read set >800MB)
# "walltime": "08:00:00",
"command": "bowtie2 %type -x %ref_base -1 %seq1 -2 %seq2 -X %Xvalue | samtools view -ubS - | samtools sort - -o %out"
},
"alignBowtie": {
"walltime": "03:00:00",
# large file size (any read set >800MB)
# "walltime": "06:00:00",
"command": "bowtie2 %type -x %ref_base -U %seq | samtools view -ubS - | samtools sort - -o %out"
},
"buildBWAIndex": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "bwa index -a is %ref"
},
"alignSequence": {
# large file size (any read set >800MB)
# "walltime": "06:00:00",
"command": "bwa aln %ref %seq > %out"
},
"alignBWAPE": {
"walltime": "03:00:00",
# large file size (any read set >800MB)
# "walltime": "06:00:00",
"command": "bwa sampe %ref %align1 %align2 %seq1 %seq2 | samtools view -ubS - | samtools sort - -o %out"
},
"alignBWASE": {
"walltime": "03:00:00",
# large file size (any read set >800MB)
# "walltime": "06:00:00",
"command": "bwa samse %ref %align %seq | samtools view -ubS - | samtools sort - -o %out"
},
"checkBam": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python checkBam.py %type %bam %seq"
},
"indexBam": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "samtools index %bam"
},
"filterUnmapped": {
"walltime": "00:15:00",
"queue": "shortq",
"command": "samtools view -hub -F 4 %bam | samtools sort - -o %out"
},
"getSamStats": {
"walltime": "00:20:00",
"queue": "shortq",
"command": "sam-stats -A -B %bam > %out"
},
"indexRef": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "samtools faidx %ref"
},
"callRepSNPs": {
"walltime": "01:00:00",
# large file size (any read set >800MB)
# "walltime": "06:00:00",
"command": "samtools mpileup -u -t DP -f %ref %bam -r %replicon | bcftools call -O b %option - > %out"
},
"checkpoint": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python checkpoint.py %outTemp %stage"
},
"getConsensus": {
"walltime": "01:00:00",
# large file size (any read set >800MB)
# "walltime": "06:00:00",
"command": "samtools mpileup -q 20 -ugB -f %ref %bam | bcftools call -c - | vcfutils.pl vcf2fq > %output"
},
"getCoverage": {
"walltime": "01:00:00",
# large file size (any read set >800MB)
# "walltime": "06:00:00",
"command": "samtools mpileup %bam | cut - -f 1-4 > %out"
},
"getCoverByRep": {
"command": "python getCoverByRep.py %ref %coverage %out"
},
"q30VarFilterPE": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "bcftools view -i 'ABS(DP4[2]-DP4[3])/(DP4[2]+DP4[3]) < %bias_cutoff ' %bcfFile | vcfutils.pl varFilter -d %min -D %cover -Q 30 > %out"
},
"q30VarFilter": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "bcftools view %bcfFile | vcfutils.pl varFilter -d %min -D %cover -Q 30 > %out"
},
"finalFilter": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python finalFilter.py %vcfFile %out %het %flag"
},
"getVcfStats": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python getVcfStats.py %vcfFile %out"
},
"deriveRepStats": {
"walltime": "00:20:00",
"queue": "shortq",
"command": "python deriveRepStats.py %coverFile %replicon %depth %cover %runType %map %check"
},
"deriveAllStats": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python deriveAllStats.py %coverFile"
},
"collateRepStats": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python collateRepStats.py %ref %in %replicon %multiplier %runType %sequence_list"
},
"collateAllStats": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "python collateAllStats.py %ref %in %path %sequence_list"
},
"mergeOutputs": {
"command": "cp %inputBam %outDirBam && cp %inputIndex %outDirBam && cp %inputVcf %outDirVcf"
},
"mergeAllStats": {
"command": "python mergeAllStats.py %newStats %mergeDir"
},
"mergeRepStats": {
"command": "python mergeRepStats.py %newStats %multiplier %replace %mergeDir %runType"
},
"getRepSNPList": {
"command": "python getRepSNPList.py %in %replicon %out"
},
"deriveAllRepGeneCover": {
"walltime": "00:15:00",
"queue": "shortq",
"command": "python deriveAllRepGeneCover.py %outDir %genbank %in"
},
"collateAllRepGeneCover": {
# large data sets (more than 150 samples)
# "walltime": "03:00:00",
"command": "python collateAllRepGeneCover.py %inDir %outDir %refName %sequence_list"
},
"mergeAllRepGeneCover": {
"walltime": "00:10:00",
"queue": "shortq",
# large data sets
# "walltime": "03:00:00",
# "queue": "normal",
"command": "python mergeAllRepGeneCover.py %inDir %outDir %refName %sequence_list"
},
"parseGeneContent": {
"walltime": "00:10:00",
"queue": "shortq",
# large data sets
# "walltime": "01:00:00",
# "queue": "normal",
"command": "python parseGeneContent.py -g %input -d %input2 -s %out -o %out2"
},
"deriveRepAlleleMatrix": {
# large number of SNPs (20000+)
# "walltime": "02:00:00",
"command": "python deriveRepAlleleMatrix.py %in %out %ref %replicon %consensus %repStats %merge_prefix"
},
"collateRepAlleleMatrix": {
"command": "python collateRepAlleleMatrix.py %in %out %sequence_list %rep_name"
},
"getDifferenceMatrix": {
"walltime": "00:10:00",
"queue": "shortq",
# large data sets
# "walltime": "04:00:00",
# "queue": "normal",
"command": "python make_distance_matrix.py -i %in"
},
"parseSNPs": {
# large data sets
# "memInGB": 8,
"command": "python parseSNPtable.py -m cons,aln,coding -s %input -c %conservation -r %genbank -q %replicon -d %dir"
},
"parseSNPsNoGBK": {
# large data sets
# "memInGB": 8,
"command": "python parseSNPtable.py -m cons,aln -s %input -c %conservation -d %dir"
},
"makeTree": {
"walltime": "01:00:00",
# large data sets
# "walltime": "06:00:00",
# "memInGB": 8,
"command": "FastTreeDbl -gtr -gamma -nt %input > %output"
},
"makeNoTree": {
"command": "python make_no_tree.py %input %out"
},
"deleteDir": {
"walltime": "00:10:00",
"queue": "shortq",
"command": "rm -rf %directory"
}
}
#end of config file