forked from ENCODE-DCC/chip-seq-pipeline2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chip.wdl
2979 lines (2804 loc) · 133 KB
/
chip.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
version 1.0
workflow chip {
String pipeline_ver = 'v1.6.1'
meta {
author: 'Jin wook Lee ([email protected]) at ENCODE-DCC'
description: 'ENCODE TF/Histone ChIP-Seq pipeline'
specification_document: 'https://docs.google.com/document/d/1lG_Rd7fnYgRpSIqrIfuVlAz2dW1VaSQThzk836Db99c/edit?usp=sharing'
caper_docker: 'encodedcc/chip-seq-pipeline:v1.6.1'
caper_singularity: 'docker://encodedcc/chip-seq-pipeline:v1.6.1'
croo_out_def: 'https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.v5.json'
parameter_group: {
pipeline_metadata: {
title: 'Pipeline metadata',
description: 'Metadata for a pipeline (e.g. title and description).'
},
reference_genome: {
title: 'Reference genome',
description: 'Genome specific files. e.g. reference FASTA, bowtie2 index, chromosome sizes file.',
help: 'Choose one chip.genome_tsv file that defines all genome specific parameters in it or define each genome specific parameter in input JSON to override those defined in genome TSV file. If you use Caper then use https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/[GENOME]_caper.tsv. Caper will automatically download/install all files defined in such TSV. Otherwise download genome TSV file by using a shell script (scripts/download_genome_data.sh [GENOME] [DEST_DIR]). Supported genomes are hg38, hg19, mm10 and mm9. See pipeline documentation if you want to build genome database from your own FASTA file. If some genome data are missing then analyses using such data will be skipped.'
},
input_genomic_data: {
title: 'Input genomic data',
description: 'Genomic input files for experiment.',
help: 'Pipeline can start with any types of experiment data (e.g. FASTQ, BAM, NODUP_BAM, TAG-ALIGN, PEAK). Choose one type and leave others empty. FASTQs have a variable for each biological replicate. e.g. chip.fastqs_rep1_R1 and chip.fastqs_rep2_R1. You can define up to 10 experiment replicates. For other types, there is an array to define file for each biological replicate. e.g. chip.bams: ["rep1.bam", "rep2.bam"]. Define sequencing endedness with chip.paired_end, if you have mixed SE and PE replicates then define chip.paired_ends instead for each replicate. e.g. chip.paired_ends: [false, true].'
},
input_genomic_data_control: {
title: 'Input genomic data (control)',
description: 'Genomic input files for control. TF ChIP-seq requires control for peak calling but histone ChIP-seq does not.',
help: 'Pipeline can start with any types of control data (e.g. FASTQ, BAM, NODUP_BAM, TAG-ALIGN). Choose one type and leave others empty. FASTQs have a variable for each control replicate. e.g. chip.ctl_fastqs_rep1_R1 and chip.ctl_fastqs_rep2_R1. You can define up to 10 control replicates. For other types, there is an array to define file for each control replicate. e.g. chip.ctl_bams: ["ctl1.bam", "ctl2.bam"]. Define sequencing endedness with chip.ctl_paired_end, if you have mixed SE and PE control replicates then define chip.ctl_paired_ends instead for each replicate. e.g. chip.ctl_paired_ends: [false, true]. If none of these are defined, pipeline will use chip.paired_end for controls.'
},
pipeline_parameter: {
title: 'Pipeline parameter',
description: 'Pipeline type and flags to turn on/off analyses.',
help: 'Use chip.align_only to align FASTQs without peak calling.'
},
alignment: {
title: 'Alignment',
description: 'Parameters for alignment.',
help: 'Pipeline can crop FASTQs (chip.crop_length > 0) with tolerance (chip.crop_length_tol) before mapping.'
},
peak_calling: {
title: 'Peak calling',
description: 'Parameters for peak calling.',
help: 'This group includes statistical thresholds for peak-calling or post-peak-calling analyses: p-val, FDR, IDR. It also includes parameters for control choosing/subsampling. All control replicates are pooled and the pooled control is used for peak calling against each experiment replicate by default (see chip.always_use_pooled_ctl). Pipeline compares read depth of experiment replicate and a chosen control. It also compares read depth of controls. If control is too deep then it is subsampled.'
},
resource_parameter: {
title: 'Resource parameter',
description: 'Number of CPUs (threads), max. memory and walltime for tasks.',
help: 'Resource settings are used for determining an instance type on cloud backends (e.g. GCP, AWS) and used for submitting tasks to a cluster engine (e.g. SLURM, SGE, ...). Walltime (chip.*_time_hr) is only used for cluster engines. Other tasks default to use 1 CPU and 4GB of memory.'
}
}
}
input {
# group: pipeline_metadata
String title = 'Untitled'
String description = 'No description'
# group: reference_genome
File? genome_tsv
String? genome_name
File? ref_fa
File? bwa_idx_tar
File? bowtie2_idx_tar
File? chrsz
File? blacklist
File? blacklist2
String? mito_chr_name
String? regex_bfilt_peak_chr_name
String? gensz
File? custom_aligner_idx_tar
# group: input_genomic_data
Boolean? paired_end
Array[Boolean] paired_ends = []
Array[File] fastqs_rep1_R1 = []
Array[File] fastqs_rep1_R2 = []
Array[File] fastqs_rep2_R1 = []
Array[File] fastqs_rep2_R2 = []
Array[File] fastqs_rep3_R1 = []
Array[File] fastqs_rep3_R2 = []
Array[File] fastqs_rep4_R1 = []
Array[File] fastqs_rep4_R2 = []
Array[File] fastqs_rep5_R1 = []
Array[File] fastqs_rep5_R2 = []
Array[File] fastqs_rep6_R1 = []
Array[File] fastqs_rep6_R2 = []
Array[File] fastqs_rep7_R1 = []
Array[File] fastqs_rep7_R2 = []
Array[File] fastqs_rep8_R1 = []
Array[File] fastqs_rep8_R2 = []
Array[File] fastqs_rep9_R1 = []
Array[File] fastqs_rep9_R2 = []
Array[File] fastqs_rep10_R1 = []
Array[File] fastqs_rep10_R2 = []
Array[File?] bams = []
Array[File?] nodup_bams = []
Array[File?] tas = []
Array[File?] peaks = []
Array[File?] peaks_pr1 = []
Array[File?] peaks_pr2 = []
File? peak_ppr1
File? peak_ppr2
File? peak_pooled
Boolean? ctl_paired_end
Array[Boolean] ctl_paired_ends = []
Array[File] ctl_fastqs_rep1_R1 = []
Array[File] ctl_fastqs_rep1_R2 = []
Array[File] ctl_fastqs_rep2_R1 = []
Array[File] ctl_fastqs_rep2_R2 = []
Array[File] ctl_fastqs_rep3_R1 = []
Array[File] ctl_fastqs_rep3_R2 = []
Array[File] ctl_fastqs_rep4_R1 = []
Array[File] ctl_fastqs_rep4_R2 = []
Array[File] ctl_fastqs_rep5_R1 = []
Array[File] ctl_fastqs_rep5_R2 = []
Array[File] ctl_fastqs_rep6_R1 = []
Array[File] ctl_fastqs_rep6_R2 = []
Array[File] ctl_fastqs_rep7_R1 = []
Array[File] ctl_fastqs_rep7_R2 = []
Array[File] ctl_fastqs_rep8_R1 = []
Array[File] ctl_fastqs_rep8_R2 = []
Array[File] ctl_fastqs_rep9_R1 = []
Array[File] ctl_fastqs_rep9_R2 = []
Array[File] ctl_fastqs_rep10_R1 = []
Array[File] ctl_fastqs_rep10_R2 = []
Array[File?] ctl_bams = []
Array[File?] ctl_nodup_bams = []
Array[File?] ctl_tas = []
# group: pipeline_parameter
String pipeline_type
Boolean align_only = false
Boolean true_rep_only = false
Boolean enable_count_signal_track = false
Boolean enable_jsd = true
Boolean enable_gc_bias = true
# group: alignment
String aligner = 'bowtie2'
File? custom_align_py
Boolean use_bwa_mem_for_pe = false
Int crop_length = 0
Int crop_length_tol = 2
Int xcor_trim_bp = 50
Boolean use_filt_pe_ta_for_xcor = false
String dup_marker = 'picard'
Boolean no_dup_removal = false
Int mapq_thresh = 30
Array[String] filter_chrs = []
Int subsample_reads = 0
Int ctl_subsample_reads = 0
Int xcor_subsample_reads = 15000000
Int xcor_exclusion_range_min = -500
Int? xcor_exclusion_range_max
# group: peak_calling
Int ctl_depth_limit = 200000000
Float exp_ctl_depth_ratio_limit = 5.0
Array[Int?] fraglen = []
String? peak_caller
Boolean always_use_pooled_ctl = true
Float ctl_depth_ratio = 1.2
Int? cap_num_peak
Float pval_thresh = 0.01
Float fdr_thresh = 0.01
Float idr_thresh = 0.05
# group: resource_parameter
Int align_cpu = 6
Float align_bowtie2_mem_factor = 0.15
Float align_bwa_mem_factor = 0.15
Int align_time_hr = 48
Float align_bowtie2_disk_factor = 8.0
Float align_bwa_disk_factor = 8.0
Int filter_cpu = 4
Float filter_mem_factor = 0.4
Int filter_time_hr = 24
Float filter_disk_factor = 6.0
Int bam2ta_cpu = 2
Float bam2ta_mem_factor = 0.35
Int bam2ta_time_hr = 6
Float bam2ta_disk_factor = 4.0
Float spr_mem_factor = 4.5
Float spr_disk_factor = 6.0
Int jsd_cpu = 4
Float jsd_mem_factor = 0.1
Int jsd_time_hr = 6
Float jsd_disk_factor = 2.0
Int xcor_cpu = 2
Float xcor_mem_factor = 1.0
Int xcor_time_hr = 24
Float xcor_disk_factor = 4.5
Float subsample_ctl_mem_factor = 7.0
Float subsample_ctl_disk_factor = 7.5
Float macs2_signal_track_mem_factor = 6.0
Int macs2_signal_track_time_hr = 24
Float macs2_signal_track_disk_factor = 40.0
Int call_peak_cpu = 6
Float call_peak_spp_mem_factor = 5.0
Float call_peak_macs2_mem_factor = 2.5
Int call_peak_time_hr = 72
Float call_peak_spp_disk_factor = 5.0
Float call_peak_macs2_disk_factor = 15.0
String? align_trimmomatic_java_heap
String? filter_picard_java_heap
String? gc_bias_picard_java_heap
}
parameter_meta {
title: {
description: 'Experiment title.',
group: 'pipeline_metadata',
example: 'ENCSR936XTK (subsampled 1/50)'
}
description: {
description: 'Experiment description.',
group: 'pipeline_metadata',
example: 'ZNF143 ChIP-seq on human GM12878 (subsampled 1/50)'
}
genome_tsv: {
description: 'Reference genome database TSV.',
group: 'reference_genome',
help: 'This TSV file includes all genome specific parameters (e.g. reference FASTA, bowtie2 index). You can still individually define any parameters in it. Parameters defined in input JSON will override those defined in genome TSV.',
example: 'https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/hg38_caper.tsv'
}
genome_name: {
description: 'Genome name.',
group: 'reference_genome'
}
ref_fa: {
description: 'Reference FASTA file.',
group: 'reference_genome'
}
bowtie2_idx_tar: {
description: 'Bowtie2 index TAR file.',
group: 'reference_genome'
}
custom_aligner_idx_tar: {
description: 'Index TAR file for a custom aligner. To use a custom aligner, define "chip.custom_align_py" too.',
group: 'reference_genome'
}
chrsz: {
description: '2-col chromosome sizes file.',
group: 'reference_genome'
}
blacklist: {
description: 'Blacklist file in BED format.',
group: 'reference_genome',
help: 'Peaks will be filtered with this file.'
}
blacklist2: {
description: 'Secondary blacklist file in BED format.',
group: 'reference_genome',
help: 'If it is defined, it will be merged with chip.blacklist. Peaks will be filtered with merged blacklist.'
}
mito_chr_name: {
description: 'Mitochondrial chromosome name.',
group: 'reference_genome',
help: 'e.g. chrM, MT. Mitochondrial reads defined here will be filtered out during filtering BAMs in "filter" task.'
}
regex_bfilt_peak_chr_name: {
description: 'Reg-ex for chromosomes to keep while filtering peaks.',
group: 'reference_genome',
help: 'Chromosomes defined here will be kept. All other chromosomes will be filtered out in .bfilt. peak file. This is done along with blacklist filtering peak file.'
}
gensz: {
description: 'Genome sizes. "hs" for human, "mm" for mouse or sum of the 2nd column in chromosome sizes file.',
group: 'reference_genome'
}
paired_end: {
description: 'Sequencing endedness.',
group: 'input_genomic_data',
help: 'Setting this on means that all replicates are paired ended. For mixed samples, use chip.paired_ends array instead.',
example: true
}
paired_ends: {
description: 'Sequencing endedness array (for mixed SE/PE datasets).',
group: 'input_genomic_data',
help: 'Whether each biological replicate is paired ended or not.'
}
fastqs_rep1_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 1.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from FASTQ files. Pipeline can start from any type of inputs (e.g. FASTQs, BAMs, ...). Choose one type, fill parameters for that type and leave others undefined. Especially for FASTQs, we have an individual variable for each biological replicate so that FASTQs of technical replicates can be merged. Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep1_R2). These FASTQs are usually technical replicates to be merged.',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.50.fastq.gz'
]
}
fastqs_rep1_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 1.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep1_R1). These FASTQs are usually technical replicates to be merged.',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.50.fastq.gz'
]
}
fastqs_rep2_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 2.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep2_R2). These FASTQs are usually technical replicates to be merged.',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.50.fastq.gz'
]
}
fastqs_rep2_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 2.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep2_R1). These FASTQs are usually technical replicates to be merged.',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.50.fastq.gz'
]
}
fastqs_rep3_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 3.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep3_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep3_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 3.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep3_R1). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep4_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 4.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep4_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep4_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 4.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep4_R1). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep5_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 5.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep5_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep5_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 5.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep5_R1). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep6_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 6.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep6_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep6_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 6.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep6_R1). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep7_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 7.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep7_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep7_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 7.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep7_R1). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep8_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 8.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep8_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep8_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 8.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep8_R1). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep9_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 9.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep9_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep9_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 9.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep9_R1). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep10_R1: {
description: 'Read1 FASTQs to be merged for a biological replicate 10.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read2 FASTQs (chip.fastqs_rep10_R2). These FASTQs are usually technical replicates to be merged.'
}
fastqs_rep10_R2: {
description: 'Read2 FASTQs to be merged for a biological replicate 10.',
group: 'input_genomic_data',
help: 'Make sure that they are consistent with read1 FASTQs (chip.fastqs_rep10_R1). These FASTQs are usually technical replicates to be merged.'
}
bams: {
description: 'List of unfiltered/raw BAM files for each biological replicate.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from BAM files. Unfiltered/raw BAM file generated from aligner (e.g. bowtie2). Each entry for each biological replicate. e.g. [rep1.bam, rep2.bam, rep3.bam, ...].'
}
nodup_bams: {
description: 'List of filtered/deduped BAM files for each biological replicate',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from filtered BAM files. Filtered/deduped BAM file. Each entry for each biological replicate. e.g. [rep1.nodup.bam, rep2.nodup.bam, rep3.nodup.bam, ...].'
}
tas: {
description: 'List of TAG-ALIGN files for each biological replicate.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from TAG-ALIGN files. TAG-ALIGN is in a 6-col BED format. It is a simplified version of BAM. Each entry for each biological replicate. e.g. [rep1.tagAlign.gz, rep2.tagAlign.gz, ...].'
}
peaks: {
description: 'List of NARROWPEAK files (not blacklist filtered) for each biological replicate.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from PEAK files. Each entry for each biological replicate. e.g. [rep1.narrowPeak.gz, rep2.narrowPeak.gz, ...]. Define other PEAK parameters (e.g. chip.peaks_pr1, chip.peak_pooled) according to your flag settings (e.g. chip.true_rep_only) and number of replicates. If you have more than one replicate then define chip.peak_pooled, chip.peak_ppr1 and chip.peak_ppr2. If chip.true_rep_only flag is on then do not define any parameters (chip.peaks_pr1, chip.peaks_pr2, chip.peak_ppr1 and chip.peak_ppr2) related to pseudo replicates.'
}
peaks_pr1: {
description: 'List of NARROWPEAK files (not blacklist filtered) for pseudo-replicate 1 of each biological replicate.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from PEAK files. Define if chip.true_rep_only flag is off.'
}
peaks_pr2: {
description: 'List of NARROWPEAK files (not blacklist filtered) for pseudo-replicate 2 of each biological replicate.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from PEAK files. Define if chip.true_rep_only flag is off.'
}
peak_pooled: {
description: 'NARROWPEAK file for pooled true replicate.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from PEAK files. Define if you have multiple biological replicates. Pooled true replicate means analysis on pooled biological replicates.'
}
peak_ppr1: {
description: 'NARROWPEAK file for pooled pseudo replicate 1.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from PEAK files. Define if you have multiple biological replicates and chip.true_rep_only flag is off. PPR1 means analysis on pooled 1st pseudo replicates. Each biological replicate is shuf/split into two pseudos. This is a pooling of each replicate\'s 1st pseudos.'
}
peak_ppr2: {
description: 'NARROWPEAK file for pooled pseudo replicate 2.',
group: 'input_genomic_data',
help: 'Define if you want to start pipeline from PEAK files. Define if you have multiple biological replicates and chip.true_rep_only flag is off. PPR2 means analysis on pooled 2nd pseudo replicates. Each biological replicate is shuf/split into two pseudos. This is a pooling of each replicate\'s 2nd pseudos.'
}
ctl_paired_end: {
description: 'Sequencing endedness for all controls.',
group: 'input_genomic_data_control',
help: 'Setting this on means that all control replicates are paired ended. For mixed controls, use chip.ctl_paired_ends array instead.'
}
ctl_paired_ends: {
description: 'Sequencing endedness array for mixed SE/PE controls.',
group: 'input_genomic_data_control',
help: 'Whether each control replicate is paired ended or not.'
}
ctl_fastqs_rep1_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 1.',
group: 'input_genomic_data_control',
help: 'Define if you want to start pipeline from FASTQ files. Pipeline can start from any type of controls (e.g. FASTQs, BAMs, ...). Choose one type, fill parameters for that type and leave others undefined. Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep1_R2).',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz'
]
}
ctl_fastqs_rep1_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 1.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep1_R1). These FASTQs are usually technical replicates to be merged.',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz'
]
}
ctl_fastqs_rep2_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 2.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep2_R2). These FASTQs are usually technical replicates to be merged.',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz'
]
}
ctl_fastqs_rep2_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 2.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep2_R1). These FASTQs are usually technical replicates to be merged.',
example: [
'https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz'
]
}
ctl_fastqs_rep3_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 3.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep3_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep3_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 3.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep3_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep4_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 4.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep4_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep4_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 4.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep4_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep5_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 5.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep5_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep5_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 5.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep5_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep6_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 6.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep6_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep6_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 6.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep6_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep7_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 7.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep7_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep7_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 7.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep7_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep8_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 8.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep8_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep8_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 8.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep8_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep9_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 9.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep9_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep9_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 9.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep9_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep10_R1: {
description: 'Read1 FASTQs to be merged for a control replicate 10.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read2 FASTQs (chip.ctl_fastqs_rep10_R2). These FASTQs are usually technical replicates to be merged.'
}
ctl_fastqs_rep10_R2: {
description: 'Read2 FASTQs to be merged for a control replicate 10.',
group: 'input_genomic_data_control',
help: 'Make sure that they are consistent with read1 FASTQs (chip.ctl_fastqs_rep10_R1). These FASTQs are usually technical replicates to be merged.'
}
ctl_bams: {
description: 'List of unfiltered/raw BAM files for each control replicate.',
group: 'input_genomic_data_control',
help: 'Define if you want to start pipeline from BAM files. Unfiltered/raw BAM file generated from aligner (e.g. bowtie2). Each entry for each control replicate. e.g. [ctl1.bam, ctl2.bam, ctl3.bam, ...].'
}
ctl_nodup_bams: {
description: 'List of filtered/deduped BAM files for each control replicate',
group: 'input_genomic_data_control',
help: 'Define if you want to start pipeline from filtered BAM files. Filtered/deduped BAM file. Each entry for each control replicate. e.g. [ctl1.nodup.bam, ctl2.nodup.bam, ctl3.nodup.bam, ...].'
}
ctl_tas: {
        description: 'List of TAG-ALIGN files for each control replicate.',
group: 'input_genomic_data_control',
help: 'Define if you want to start pipeline from TAG-ALIGN files. TAG-ALIGN is in a 6-col BED format. It is a simplified version of BAM. Each entry for each control replicate. e.g. [ctl1.tagAlign.gz, ctl2.tagAlign.gz, ...].'
}
pipeline_type: {
description: 'Pipeline type. tf for TF ChIP-Seq, histone for Histone ChIP-Seq or control for mapping controls only.',
group: 'pipeline_parameter',
help: 'Default peak caller is different for each type. spp For TF ChIP-Seq and macs2 for histone ChIP-Seq. Regardless of pipeline type, spp always requires controls but macs2 doesn\'t. For control mode, chip.align_only is automatically turned on and cross-correlation analysis is disabled. Do not define ctl_* for control mode. Define fastqs_repX_RY instead.',
choices: ['tf', 'histone', 'control'],
example: 'tf'
}
align_only: {
description: 'Align only mode.',
group: 'pipeline_parameter',
help: 'Reads will be aligned but there will be no peak-calling on them. It is turned on automatically if chip.pipeline_type is control.'
}
true_rep_only: {
description: 'Disables all analyses related to pseudo-replicates.',
group: 'pipeline_parameter',
help: 'Pipeline generates 2 pseudo-replicate from one biological replicate. This flag turns off all analyses related to pseudos (with prefix/suffix pr, ppr).'
}
enable_count_signal_track: {
description: 'Enables generation of count signal tracks.',
group: 'pipeline_parameter'
}
enable_jsd: {
description: 'Enables Jensen-Shannon Distance (JSD) plot generation.',
group: 'pipeline_parameter'
}
enable_gc_bias: {
description: 'Enables GC bias calculation.',
group: 'pipeline_parameter'
}
aligner: {
description: 'Aligner. bowtie2, bwa or custom',
group: 'alignment',
help: 'It is bowtie2 by default. To use a custom aligner, define chip.custom_align_py and chip.custom_aligner_idx_tar.',
choices: ['bowtie2', 'bwa', 'custom'],
example: 'bowtie2'
}
custom_align_py: {
description: 'Python script for a custom aligner.',
group: 'alignment',
help: 'There is a template included in the documentation for inputs. Defining this parameter will automatically change "chip.aligner" to "custom". You should also define "chip.custom_aligner_idx_tar".'
}
use_bwa_mem_for_pe: {
description: 'For paired end dataset with read length >= 70bp, use bwa mem instead of bwa aln.',
group: 'alignment',
help: 'Use it only for paired end reads >= 70bp.'
}
crop_length: {
description: 'Crop FASTQs\' reads longer than this length.',
group: 'alignment',
help: 'Also drop all reads shorter than chip.crop_length - chip.crop_length_tol.'
}
crop_length_tol: {
description: 'Tolerance for cropping reads in FASTQs.',
group: 'alignment',
help: 'Drop all reads shorter than chip.crop_length - chip.crop_length_tol. Activated only when chip.crop_length is defined.'
}
xcor_trim_bp: {
description: 'Trim experiment read1 FASTQ (for both SE and PE) for cross-correlation analysis.',
group: 'alignment',
        help: 'This does not affect alignment of experimental/control replicates. Pipeline additionally aligns R1 FASTQ for cross-correlation analysis only. This parameter is used for it.'
}
use_filt_pe_ta_for_xcor: {
description: 'Use filtered PE BAM for cross-correlation analysis.',
group: 'alignment',
help: 'If not defined, pipeline uses SE BAM generated from trimmed read1 FASTQ for cross-correlation analysis.'
}
dup_marker: {
description: 'Marker for duplicate reads. picard or sambamba.',
group: 'alignment',
help: 'picard for Picard MarkDuplicates or sambamba for sambamba markdup.',
choices: ['picard', 'sambamba'],
example: 'picard'
}
no_dup_removal: {
description: 'Disable removal of duplicate reads during filtering BAM.',
group: 'alignment',
        help: 'Duplicate reads are filtered out during filtering BAMs to generate NODUP_BAM. This flag will keep all duplicate reads in NODUP_BAM. This flag does not affect naming of NODUP_BAM. NODUP_BAM will still have .nodup. suffix in its filename.'
}
mapq_thresh: {
description: 'Threshold for low MAPQ reads removal.',
group: 'alignment',
help: 'Low MAPQ reads are filtered out while filtering BAM.'
}
filter_chrs: {
description: 'List of chromosomes to be filtered out while filtering BAM.',
group: 'alignment',
        help: 'It is empty by default, hence no filtering out of specific chromosomes. It is case-sensitive. Use exact word for chromosome names.'
}
subsample_reads: {
description: 'Subsample reads. Shuffle and subsample reads.',
group: 'alignment',
help: 'This affects all downstream analyses after filtering experiment BAM. (e.g. all TAG-ALIGN files, peak-calling). Reads will be shuffled only if actual number of reads in BAM exceeds this number. 0 means disabled.'
}
ctl_subsample_reads: {
description: 'Subsample control reads. Shuffle and subsample control reads.',
group: 'alignment',
help: 'This affects all downstream analyses after filtering control BAM. (e.g. all TAG-ALIGN files, peak-calling). Reads will be shuffled only if actual number of reads in BAM exceeds this number. 0 means disabled.'
}
xcor_subsample_reads: {
        description: 'Subsample reads for cross-correlation analysis only.',
group: 'alignment',
help: 'This does not affect downstream analyses after filtering BAM. It is for cross-correlation analysis only. 0 means disabled.'
}
xcor_exclusion_range_min: {
description: 'Exclusion minimum for cross-correlation analysis.',
group: 'alignment',
help: 'For run_spp.R -s. Make sure that it is consistent with default strand shift -s=-500:5:1500 in run_spp.R.'
}
xcor_exclusion_range_max: {
        description: 'Exclusion maximum for cross-correlation analysis.',
group: 'alignment',
        help: 'For run_spp.R -s. If not defined, default values of `max(read length + 10, 50)` for TF and `max(read length + 10, 100)` for histone are used.'
}
ctl_depth_limit: {
description: 'Hard limit for chosen control\'s depth.',
group: 'peak_calling',
help: 'If control chosen by chip.always_use_pooled_ctl and chip.ctl_depth_ratio is deeper than this hard limit, then such control is subsampled.'
}
exp_ctl_depth_ratio_limit: {
description: 'Second limit for chosen control\'s depth.',
group: 'peak_calling',
help: 'If control chosen by chip.always_use_pooled_ctl and chip.ctl_depth_ratio is deeper than experiment replicate\'s read depth multiplied by this factor then such control is subsampled down to maximum of multiplied value and hard limit chip.ctl_depth_limit.'
}
fraglen: {
description: 'Fragment length for each biological replicate.',
group: 'peak_calling',
help: 'Fragment length is estimated by cross-correlation analysis, which is valid only when pipeline started from FASTQs. If defined, fragment length estimated by cross-correlation analysis is ignored.'
}
peak_caller: {
description: 'Peak caller.',
group: 'peak_calling',
help: 'It is spp and macs2 by default for TF ChIP-seq and histone ChIP-seq, respectively. e.g. you can use macs2 for TF ChIP-Seq even though spp is by default for TF ChIP-Seq (chip.pipeline_type == tf).',
choices: ['spp', 'macs2'],
example: 'spp'
}
always_use_pooled_ctl: {
description: 'Always choose a pooled control for each experiment replicate.',
group: 'peak_calling',
help: 'If turned on, ignores chip.ctl_depth_ratio.'
}
ctl_depth_ratio: {
description: 'Maximum depth ratio between control replicates.',
group: 'peak_calling',
help: 'If ratio of depth between any two controls is higher than this, then always use a pooled control for all experiment replicates.'
}
cap_num_peak: {
description: 'Upper limit on the number of peaks.',
group: 'peak_calling',
        help: 'It is 300000 and 500000 by default for spp and macs2, respectively.'
}
pval_thresh: {
        description: 'p-value threshold for MACS2 peak caller.',
group: 'peak_calling',
help: 'macs2 callpeak -p'
}
fdr_thresh: {
description: 'FDR threshold for spp peak caller (phantompeakqualtools).',
group: 'peak_calling',
help: 'run_spp.R -fdr='
}
idr_thresh: {
description: 'IDR threshold.',
group: 'peak_calling'
}
align_cpu: {
description: 'Number of cores for task align.',
group: 'resource_parameter',
help: 'Task align merges/crops/maps FASTQs.'
}
align_bowtie2_mem_factor: {
description: 'Multiplication factor to determine memory required for task align with bowtie2 (default) as aligner.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of FASTQs to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
align_bwa_mem_factor: {
description: 'Multiplication factor to determine memory required for task align with bwa as aligner.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of FASTQs to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
align_time_hr: {
description: 'Walltime (h) required for task align.',
group: 'resource_parameter',
help: 'This is for HPCs only. e.g. SLURM, SGE, ...'
}
align_bowtie2_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task align with bowtie2 (default) as aligner.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of FASTQs to determine required disk size of instance on GCP/AWS.'
}
align_bwa_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task align with bwa as aligner.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of FASTQs to determine required disk size of instance on GCP/AWS.'
}
filter_cpu: {
description: 'Number of cores for task filter.',
group: 'resource_parameter',
        help: 'Task filter filters raw/unfiltered BAM to get filtered/deduped BAM.'
}
filter_mem_factor: {
description: 'Multiplication factor to determine memory required for task filter.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of BAMs to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
filter_time_hr: {
description: 'Walltime (h) required for task filter.',
group: 'resource_parameter',
help: 'This is for HPCs only. e.g. SLURM, SGE, ...'
}
filter_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task filter.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of BAMs to determine required disk size of instance on GCP/AWS.'
}
bam2ta_cpu: {
description: 'Number of cores for task bam2ta.',
group: 'resource_parameter',
        help: 'Task bam2ta converts filtered/deduped BAM into TAG-ALIGN (6-col BED) format.'
}
bam2ta_mem_factor: {
description: 'Multiplication factor to determine memory required for task bam2ta.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of filtered BAMs to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
bam2ta_time_hr: {
description: 'Walltime (h) required for task bam2ta.',
group: 'resource_parameter',
help: 'This is for HPCs only. e.g. SLURM, SGE, ...'
}
bam2ta_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task bam2ta.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of filtered BAMs to determine required disk size of instance on GCP/AWS.'
}
spr_mem_factor: {
description: 'Multiplication factor to determine memory required for task spr.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of filtered BAMs to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
spr_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task spr.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of filtered BAMs to determine required disk size of instance on GCP/AWS.'
}
jsd_cpu: {
description: 'Number of cores for task jsd.',
group: 'resource_parameter',
help: 'Task jsd plots Jensen-Shannon distance and metrics related to it.'
}
jsd_mem_factor: {
description: 'Multiplication factor to determine memory required for task jsd.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of filtered BAMs to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
jsd_time_hr: {
description: 'Walltime (h) required for task jsd.',
group: 'resource_parameter',
help: 'This is for HPCs only. e.g. SLURM, SGE, ...'
}
jsd_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task jsd.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of filtered BAMs to determine required disk size of instance on GCP/AWS.'
}
xcor_cpu: {
description: 'Number of cores for task xcor.',
group: 'resource_parameter',
help: 'Task xcor does cross-correlation analysis (including a plot) on subsampled TAG-ALIGNs.'
}
xcor_mem_factor: {
description: 'Multiplication factor to determine memory required for task xcor.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
xcor_time_hr: {
description: 'Walltime (h) required for task xcor.',
group: 'resource_parameter',
help: 'This is for HPCs only. e.g. SLURM, SGE, ...'
}
xcor_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task xcor.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required disk size of instance on GCP/AWS.'
}
subsample_ctl_mem_factor: {
description: 'Multiplication factor to determine memory required for task subsample_ctl.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
subsample_ctl_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task subsample_ctl.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required disk size of instance on GCP/AWS.'
}
call_peak_cpu: {
description: 'Number of cores for task call_peak. IF MACS2 is chosen as peak_caller (or chip.pipeline_type is histone), then cpu will be fixed at 2.',
group: 'resource_parameter',
        help: 'Task call_peak calls peaks on TAG-ALIGNs by using SPP/MACS2 peak caller. MACS2 is single-threaded so cpu will be fixed at 2 for MACS2.'
}
call_peak_spp_mem_factor: {
description: 'Multiplication factor to determine memory required for task call_peak with spp as peak_caller.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
call_peak_macs2_mem_factor: {
description: 'Multiplication factor to determine memory required for task call_peak with macs2 as peak_caller.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
call_peak_time_hr: {
description: 'Walltime (h) required for task call_peak.',
group: 'resource_parameter',
help: 'This is for HPCs only. e.g. SLURM, SGE, ...'
}
call_peak_spp_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task call_peak with spp as peak_caller.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required disk size of instance on GCP/AWS.'
}
call_peak_macs2_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task call_peak with macs2 as peak_caller.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required disk size of instance on GCP/AWS.'
}
macs2_signal_track_mem_factor: {
description: 'Multiplication factor to determine memory required for task macs2_signal_track.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required memory of instance (GCP/AWS) or job (HPCs).'
}
macs2_signal_track_time_hr: {
description: 'Walltime (h) required for task macs2_signal_track.',
group: 'resource_parameter',
help: 'This is for HPCs only. e.g. SLURM, SGE, ...'
}
macs2_signal_track_disk_factor: {
description: 'Multiplication factor to determine persistent disk size for task macs2_signal_track.',
group: 'resource_parameter',
help: 'This factor will be multiplied to the size of TAG-ALIGNs (BEDs) to determine required disk size of instance on GCP/AWS.'
}
align_trimmomatic_java_heap: {
description: 'Maximum Java heap (java -Xmx) in task align.',
group: 'resource_parameter',
help: 'Maximum memory for Trimmomatic. If not defined, 90% of align task\'s memory will be used.'
}
filter_picard_java_heap: {
description: 'Maximum Java heap (java -Xmx) in task filter.',
group: 'resource_parameter',
help: 'Maximum memory for Picard tools MarkDuplicates. If not defined, 90% of filter task\'s memory will be used.'
}
gc_bias_picard_java_heap: {
description: 'Maximum Java heap (java -Xmx) in task gc_bias.',
group: 'resource_parameter',
help: 'Maximum memory for Picard tools CollectGcBiasMetrics. If not defined, 90% of gc_bias task\'s memory will be used.'
}
}
# read genome data and paths
if ( defined(genome_tsv) ) {
call read_genome_tsv { input: genome_tsv = genome_tsv }
}
File ref_fa_ = select_first([ref_fa, read_genome_tsv.ref_fa])
File? bwa_idx_tar_ = if defined(bwa_idx_tar) then bwa_idx_tar
else read_genome_tsv.bwa_idx_tar
File bowtie2_idx_tar_ = select_first([bowtie2_idx_tar, read_genome_tsv.bowtie2_idx_tar])
File chrsz_ = select_first([chrsz, read_genome_tsv.chrsz])
String gensz_ = select_first([gensz, read_genome_tsv.gensz])
File? blacklist1_ = if defined(blacklist) then blacklist
else read_genome_tsv.blacklist
File? blacklist2_ = if defined(blacklist2) then blacklist2
else read_genome_tsv.blacklist2
# merge multiple blacklists
# two blacklists can have different number of columns (3 vs 6)
# so we limit merged blacklist's columns to 3
Array[File] blacklists = select_all([blacklist1_, blacklist2_])
if ( length(blacklists) > 1 ) {
call pool_ta as pool_blacklist { input:
tas = blacklists,
col = 3,
}
}
File? blacklist_ = if length(blacklists) > 1 then pool_blacklist.ta_pooled
else if length(blacklists) > 0 then blacklists[0]
else blacklist2_
String mito_chr_name_ = select_first([mito_chr_name, read_genome_tsv.mito_chr_name])
String regex_bfilt_peak_chr_name_ = select_first([regex_bfilt_peak_chr_name, read_genome_tsv.regex_bfilt_peak_chr_name])
String genome_name_ = select_first([genome_name, read_genome_tsv.genome_name, basename(chrsz_)])
### temp vars (do not define these)
String aligner_ = if defined(custom_align_py) then 'custom' else aligner
String peak_caller_ = if pipeline_type=='tf' then select_first([peak_caller, 'spp'])
else select_first([peak_caller, 'macs2'])
String peak_type_ = if peak_caller_=='spp' then 'regionPeak'
else 'narrowPeak'
Boolean enable_idr = pipeline_type=='tf' # enable_idr for TF chipseq only
String idr_rank_ = if peak_caller_=='spp' then 'signal.value'
else if peak_caller_=='macs2' then 'p.value'
else 'p.value'
Int cap_num_peak_spp = 300000
Int cap_num_peak_macs2 = 500000
Int cap_num_peak_ = if peak_caller_ == 'spp' then select_first([cap_num_peak, cap_num_peak_spp])
else select_first([cap_num_peak, cap_num_peak_macs2])
Int mapq_thresh_ = mapq_thresh
Boolean enable_xcor_ = if pipeline_type=='control' then false else true
Boolean enable_count_signal_track_ = if pipeline_type=='control' then false else enable_count_signal_track
Boolean enable_jsd_ = if pipeline_type=='control' then false else enable_jsd
Boolean enable_gc_bias_ = if pipeline_type=='control' then false else enable_gc_bias