From 7404b0e6f7470c4d04d80f7037f1068ad091d9ba Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Aug 2024 17:07:03 +0200 Subject: [PATCH 1/6] Add a selectGenotype switch --- gatk.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gatk.wdl b/gatk.wdl index 0b93efe6..a2aff322 100644 --- a/gatk.wdl +++ b/gatk.wdl @@ -1514,6 +1514,7 @@ task SelectVariants { Array[File] intervals = [] String? selectTypeToInclude + String? selectGenotype String javaXmx = "4G" String memory = "5GiB" @@ -1529,6 +1530,7 @@ task SelectVariants { -R ~{referenceFasta} \ -V ~{inputVcf} \ ~{"--select-type-to-include " + selectTypeToInclude} \ + ~{"-select-genotype " + selectGenotype} \ ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} \ -O ~{outputPath} } From d86d9cb89a8f8b74ad2b714a23e1686fd4f26e3d Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 27 Aug 2024 10:19:18 +0200 Subject: [PATCH 2/6] Quote select genotype value --- gatk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gatk.wdl b/gatk.wdl index a2aff322..f272a2f9 100644 --- a/gatk.wdl +++ b/gatk.wdl @@ -1530,7 +1530,7 @@ task SelectVariants { -R ~{referenceFasta} \ -V ~{inputVcf} \ ~{"--select-type-to-include " + selectTypeToInclude} \ - ~{"-select-genotype " + selectGenotype} \ + ~{"-select-genotype \"" + selectGenotype}~{true="\"" false="" defined(selectGenotype)} \ ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} \ -O ~{outputPath} } From 558c9b7d7370b0f46346c16beaa4d4cb3f48b09e Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 30 Aug 2024 15:23:55 +0200 Subject: [PATCH 3/6] Add exclude filtered expression --- gatk.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gatk.wdl b/gatk.wdl index f272a2f9..230674a5 100644 --- a/gatk.wdl +++ b/gatk.wdl @@ -1513,6 +1513,7 @@ task SelectVariants { String outputPath = "output.vcf.gz" Array[File] intervals = [] + Boolean excludeFiltered = false String? selectTypeToInclude String? 
selectGenotype @@ -1531,6 +1532,7 @@ task SelectVariants { -V ~{inputVcf} \ ~{"--select-type-to-include " + selectTypeToInclude} \ ~{"-select-genotype \"" + selectGenotype}~{true="\"" false="" defined(selectGenotype)} \ + ~{true="--exclude-filtered" false="" excludeFiltered} \ ~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} \ -O ~{outputPath} } From 75f36133cb52ce6f02701ff11612f6884a8d1726 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 18 Oct 2024 14:52:33 +0200 Subject: [PATCH 4/6] Use reference files in rtg-tools tasks to make tasks cacheable --- rtg.wdl | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/rtg.wdl b/rtg.wdl index 3e9dab9b..62e1e77f 100644 --- a/rtg.wdl +++ b/rtg.wdl @@ -24,8 +24,7 @@ task Format { input { Array[File]+ inputFiles String format = "fasta" - String outputPath = "seq_data.sdf" - + String outputPath = "reference_data" String rtgMem = "8G" String memory = "9GiB" Int timeMinutes = 1 + ceil(size(inputFiles, "GiB") * 2) @@ -41,7 +40,7 @@ task Format { } output { - File sdf = outputPath + Array[File] referenceFiles = glob("~{outputPath}/*") } runtime { @@ -61,7 +60,7 @@ task Format { dockerImage: {description: "The docker image used for this task. 
Changing this may result in errors which the developers may choose not to address.", category: "advanced"} # outputs - sdf: {description: "RTGSequence Data File (SDF) format version of the input file(s)."} + referenceFiles: {description: "An array with all the generated reference files"} } } @@ -74,7 +73,7 @@ task VcfEval { Boolean squashPloidy = false String outputMode = "split" String outputDir = "output/" - File template + Array[File] referenceFiles Boolean allRecords = false Boolean decompose = false Boolean refOverlap = false @@ -99,7 +98,7 @@ task VcfEval { ~{"--evaluation-regions " + evaluationRegions} \ ~{"--bed-regions " + bedRegions} \ --output ~{outputDir} \ - --template ~{template} \ + --template $(dirname ~{referenceFiles[0]}) \ ~{true="--all-records" false="" allRecords} \ ~{true="--decompose" false="" decompose} \ ~{true="--ref-overlap" false="" refOverlap} \ @@ -152,7 +151,7 @@ task VcfEval { squashPloidy: {description: "treat heterozygous genotypes as homozygous ALT in both baseline and calls, to allow matches that ignore zygosity differences.", category: "common"} outputMode: {description: "output reporting mode. 
Allowed values are [split, annotate, combine, ga4gh, roc-only] (Default is split).", category: "advanced"} outputDir: {description: "Directory for output.", category: "advanced"} - template: {description: "SDF of the reference genome the variants are called against.", category: "required"} + referenceFiles: {description: "An array of reference Files generated by the Format task.", category: "required"} allRecords: {description: "use all records regardless of FILTER status (Default is to only process records where FILTER is \".\" or \"PASS\").", category: "common"} decompose: {description: "decompose complex variants into smaller constituents to allow partial credit.", category: "common"} refOverlap: {description: "allow alleles to overlap where bases of either allele are same-as-ref (Default is to only allow VCF anchor base overlap).", category: "common"} From f13a7e2dbe793b2742080b91d90e42b29f6c0e6c Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 13 Nov 2024 16:47:03 +0100 Subject: [PATCH 5/6] Update parameter_meta --- gatk.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gatk.wdl b/gatk.wdl index 230674a5..655a0b66 100644 --- a/gatk.wdl +++ b/gatk.wdl @@ -1558,6 +1558,8 @@ task SelectVariants { outputPath: {description: "The location the output VCF file should be written.", category: "advanced"} intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"} selectTypeToInclude: {description: "Select only a certain type of variants from the input file.", category: "common"} + excludeFiltered: {description: "Remove all variants that do not have a PASS filter", category: "advanced"} + selectGenotype: {description: "The genotype to be selected", category: "advanced"} javaXmx: {description: "The maximum memory available to the program. 
Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"} memory: {description: "The amount of memory this job will use.", category: "advanced"} timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} From 3c8d2e73d12d9cd3101752dff2976f86d61b4c23 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 13 Nov 2024 16:48:14 +0100 Subject: [PATCH 6/6] Update changelog --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6acbbc85..6db06e23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,13 +8,14 @@ This document is user facing. Please word the changes in such a way that users understand how the changes affect the new version. --> -version 5.3.0-dev +version 6.0.0-dev --------------------------- ++ rtg Format now outputs `referenceFiles` (an array of files) instead of `sdf`, and VcfEval takes `referenceFiles` as input instead of `template`, to enable caching. ++ Added the `selectGenotype` and `excludeFiltered` inputs to GATK SelectVariants to set the `-select-genotype` and `--exclude-filtered` flags. + Use softlinks to localise the database for centrifuge. + Added the FastqFilter task. + Added a new input `revcomp` to cutadapt to set the `--revcomp` flag, defaults to `false`. - version 5.2.0 --------------------------- + Update cutadapt version to 4.4