biowdl · rhpvorderman · Nov 19, 2024 · Aug 26, 2024 · Aug 27, 2024 · Aug 30, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,13 +8,14 @@ This document is user facing. Please word the changes in such a way
 that users understand how the changes affect the new version.
 -->
 
-version 5.3.0-dev
+version 6.0.0-dev
 ---------------------------
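++ rtg Format and VcfEval tasks now handle the reference as an array of files to enable caching: the `sdf` output of Format and the `template` input of VcfEval are both replaced by `referenceFiles`.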
++ Added new inputs `selectGenotype` and `excludeFiltered` to GATK SelectVariants to set the `--select-genotype` and `--exclude-filtered` flags.
 + Use softlinks to localise the database for centrifuge.
 + Added the FastqFilter task.
 + Added a new input `revcomp` to cutadapt to set the `--revcomp` flag, defaults to `false`.
 
-
 version 5.2.0
 ---------------------------
 + Update cutadapt version to 4.4

diff --git a/gatk.wdl b/gatk.wdl
@@ -1513,7 +1513,9 @@ task SelectVariants {
         String outputPath = "output.vcf.gz"
         Array[File] intervals = []
 
+        Boolean excludeFiltered = false
         String? selectTypeToInclude
+        String? selectGenotype
 
         String javaXmx = "4G"
         String memory = "5GiB"
@@ -1529,6 +1531,8 @@ task SelectVariants {
         -R ~{referenceFasta} \
         -V ~{inputVcf} \
         ~{"--select-type-to-include " + selectTypeToInclude} \
+        ~{"-select-genotype \"" + selectGenotype}~{true="\"" false="" defined(selectGenotype)} \
+        ~{true="--exclude-filtered" false="" excludeFiltered} \
         ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} \
         -O ~{outputPath}
     }
@@ -1554,6 +1558,8 @@ task SelectVariants {
         outputPath: {description: "The location the output VCF file should be written.", category: "advanced"}
         intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
         selectTypeToInclude: {description: "Select only a certain type of variants from the input file.", category: "common"}
+        excludeFiltered: {description: "Remove all variants that do not have a PASS filter", category: "advanced"}
+        selectGenotype: {description: "The genotype to be selected", category: "advanced"}
         javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
         memory: {description: "The amount of memory this job will use.", category: "advanced"}
         timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}

diff --git a/rtg.wdl b/rtg.wdl
@@ -24,8 +24,7 @@ task Format {
     input {
         Array[File]+ inputFiles
         String format = "fasta"
-        String outputPath = "seq_data.sdf"
-
+        String outputPath = "reference_data"
         String rtgMem = "8G"
         String memory = "9GiB"
         Int timeMinutes = 1 + ceil(size(inputFiles, "GiB") * 2)
@@ -41,7 +40,7 @@ task Format {
     }
 
     output {
-        File sdf = outputPath
+        Array[File] referenceFiles = glob("~{outputPath}/*")
     }
 
     runtime {
@@ -61,7 +60,7 @@ task Format {
         dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}
 
         # outputs
-        sdf: {description: "RTGSequence Data File (SDF) format version of the input file(s)."}
+        referenceFiles: {description: "An array with all the generated reference files"}
     }
 }
 
@@ -74,7 +73,7 @@ task VcfEval {
         Boolean squashPloidy = false
         String outputMode = "split"
         String outputDir = "output/"
-        File template
+        Array[File] referenceFiles
         Boolean allRecords = false
         Boolean decompose = false
         Boolean refOverlap = false
@@ -99,7 +98,7 @@ task VcfEval {
         ~{"--evaluation-regions " + evaluationRegions} \
         ~{"--bed-regions " + bedRegions} \
         --output ~{outputDir} \
-        --template ~{template} \
+        --template $(dirname ~{referenceFiles[0]}) \
         ~{true="--all-records" false="" allRecords} \
         ~{true="--decompose" false="" decompose} \
         ~{true="--ref-overlap" false="" refOverlap} \
@@ -152,7 +151,7 @@ task VcfEval {
         squashPloidy: {description: "treat heterozygous genotypes as homozygous ALT in both baseline and calls, to allow matches that ignore zygosity differences.", category: "common"}
         outputMode: {description: "output reporting mode. Allowed values are [split, annotate, combine, ga4gh, roc-only] (Default is split).", category: "advanced"}
         outputDir: {description: "Directory for output.", category: "advanced"}
-        template: {description: "SDF of the reference genome the variants are called against.", category: "required"}
+        referenceFiles: {description: "An array of reference Files generated by the Format task.", category: "required"}
         allRecords: {description: "use all records regardless of FILTER status (Default is to only process records where FILTER is \".\" or \"PASS\").", category: "common"}
         decompose: {description: "decompose complex variants into smaller constituents to allow partial credit.", category: "common"}
         refOverlap: {description: "allow alleles to overlap where bases of either allele are same-as-ref (Default is to only allow VCF anchor base overlap).", category: "common"}