Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates for better clinical validation pipeline #325

Merged
merged 6 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ This document is user facing. Please word the changes in such a way
that users understand how the changes affect the new version.
-->

version 5.3.0-dev
version 6.0.0-dev
---------------------------
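+ The rtg Format and VcfEval tasks now handle the reference as an array of files to enable caching: the Format output `sdf` is replaced by `referenceFiles`, and the VcfEval input `template` is replaced by `referenceFiles`.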
+ Added the `--select-genotype` and `--exclude-filtered` flags to GATK SelectVariants.
+ Use softlinks to localise the database for centrifuge.
+ Added the FastqFilter task.
+ Added a new input `revcomp` to cutadapt to set the `--revcomp` flag, defaults to `false`.


version 5.2.0
---------------------------
+ Update cutadapt version to 4.4
Expand Down
6 changes: 6 additions & 0 deletions gatk.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -1513,7 +1513,9 @@ task SelectVariants {
String outputPath = "output.vcf.gz"
Array[File] intervals = []

Boolean excludeFiltered = false
String? selectTypeToInclude
String? selectGenotype

String javaXmx = "4G"
String memory = "5GiB"
Expand All @@ -1529,6 +1531,8 @@ task SelectVariants {
-R ~{referenceFasta} \
-V ~{inputVcf} \
~{"--select-type-to-include " + selectTypeToInclude} \
~{"-select-genotype \"" + selectGenotype}~{true="\"" false="" defined(selectGenotype)} \
~{true="--exclude-filtered" false="" excludeFiltered} \
~{true="-L" false="" length(intervals) > 0} ~{sep=' -L ' intervals} \
-O ~{outputPath}
}
Expand All @@ -1554,6 +1558,8 @@ task SelectVariants {
outputPath: {description: "The location the output VCF file should be written.", category: "advanced"}
intervals: {description: "Bed files or interval lists describing the regions to operate on.", category: "common"}
selectTypeToInclude: {description: "Select only a certain type of variants from the input file.", category: "common"}
excludeFiltered: {description: "Remove all variants that do not have a PASS filter", category: "advanced"}
selectGenotype: {description: "The genotype to be selected", category: "advanced"}
javaXmx: {description: "The maximum memory available to the program. Should be lower than `memory` to accommodate JVM overhead.", category: "advanced"}
memory: {description: "The amount of memory this job will use.", category: "advanced"}
timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"}
Expand Down
13 changes: 6 additions & 7 deletions rtg.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ task Format {
input {
Array[File]+ inputFiles
String format = "fasta"
String outputPath = "seq_data.sdf"

String outputPath = "reference_data"
String rtgMem = "8G"
String memory = "9GiB"
Int timeMinutes = 1 + ceil(size(inputFiles, "GiB") * 2)
Expand All @@ -41,7 +40,7 @@ task Format {
}

output {
File sdf = outputPath
Array[File] referenceFiles = glob("~{outputPath}/*")
}

runtime {
Expand All @@ -61,7 +60,7 @@ task Format {
dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"}

# outputs
sdf: {description: "RTGSequence Data File (SDF) format version of the input file(s)."}
referenceFiles: {description: "An array with all the generated reference files"}
}
}

Expand All @@ -74,7 +73,7 @@ task VcfEval {
Boolean squashPloidy = false
String outputMode = "split"
String outputDir = "output/"
File template
Array[File] referenceFiles
Boolean allRecords = false
Boolean decompose = false
Boolean refOverlap = false
Expand All @@ -99,7 +98,7 @@ task VcfEval {
~{"--evaluation-regions " + evaluationRegions} \
~{"--bed-regions " + bedRegions} \
--output ~{outputDir} \
--template ~{template} \
--template $(dirname ~{referenceFiles[0]}) \
~{true="--all-records" false="" allRecords} \
~{true="--decompose" false="" decompose} \
~{true="--ref-overlap" false="" refOverlap} \
Expand Down Expand Up @@ -152,7 +151,7 @@ task VcfEval {
squashPloidy: {description: "treat heterozygous genotypes as homozygous ALT in both baseline and calls, to allow matches that ignore zygosity differences.", category: "common"}
outputMode: {description: "output reporting mode. Allowed values are [split, annotate, combine, ga4gh, roc-only] (Default is split).", category: "advanced"}
outputDir: {description: "Directory for output.", category: "advanced"}
template: {description: "SDF of the reference genome the variants are called against.", category: "required"}
referenceFiles: {description: "An array of reference Files generated by the Format task.", category: "required"}
allRecords: {description: "use all records regardless of FILTER status (Default is to only process records where FILTER is \".\" or \"PASS\").", category: "common"}
decompose: {description: "decompose complex variants into smaller constituents to allow partial credit.", category: "common"}
refOverlap: {description: "allow alleles to overlap where bases of either allele are same-as-ref (Default is to only allow VCF anchor base overlap).", category: "common"}
Expand Down
Loading