Merge branch 'GRIDSSfilterchange'

WEHI-ResearchComputing · Jun 26, 2024 · 7c9f6ad · 7c9f6ad
2 parents d755212 + 73afcee
commit 7c9f6ad
Show file tree

Hide file tree

Showing 5 changed files with 134 additions and 113 deletions.
diff --git a/Rtools/malDrugR/gridss_majorityfilt.R b/Rtools/malDrugR/gridss_majorityfilt.R
@@ -1,5 +1,5 @@
 ## Read gridss vcf that has been filtered with gridss_somatic_filter.R
-## and filter by requiring that a majority of resistant samples pass quality 
+## and filter by requiring that a majority of resistant samples pass quality
 ## criteria.
 ## Alternative filtering requires a majority of resistant samples to have AF
 ## greater than some critical value, and the maximum AF among resistant samples
@@ -17,139 +17,165 @@ argp <- arg_parser(paste(
   "Read an SV vcf filtered to 'somatic' events, and filter again."
 ))
 argp <- add_argument(argp, "--samplegroup",
-                     help = "Group name of related samples. Required"
+  help = "Group name of related samples. Required"
+)
+argp <- add_argument(argp, "--scriptdir",
+  help = "Path to libgridss.R script"
 )
-argp = add_argument(argp, "--scriptdir", 
-                    help="Path to libgridss.R script")
 
 argp <- add_argument(argp, "--parentlist",
-                     help = "parent filenames in a space separated string"
+  help = "parent filenames in a space separated string"
 )
 argp <- add_argument(argp, "--critsamplecount",
-                     type = "double",
-                     help = paste("Minimum number of (non-parent) samples",
-                                  "having an SV. Default is (#samples + 1)/2")
+  type = "double",
+  help = paste(
+    "Minimum number of (non-parent) samples",
+    "having an SV. Default is (#samples + 1)/2"
+  )
 )
 argv <- parse_args(argp)
 
 
-## GRIDSS function readVcf copied from "libgridss.R": 
-readVcf = function(file, ...) {
-    raw_vcf = VariantAnnotation::readVcf(file=file, ...)
-    if (!all(unlist(alt(raw_vcf)) != "")) {
-        write(
-            "Performing work-around for https://github.com/Bioconductor/VariantAnnotation/issues/8",
-            stderr())
-        alt = read_tsv(
-            file, comment="#", 
-            col_names=c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
-                        "INFO", "FORMAT", seq_len(ncol(geno(raw_vcf)[[1]]))),
-            cols_only(ALT=col_character()))$ALT
-        VariantAnnotation::fixed(raw_vcf)$ALT = CharacterList(lapply(as.character(alt), function(x) x))
-    }
-    raw_vcf = fix_parid(raw_vcf)
-    if ("MATEID" %in% names(info(raw_vcf)) & !is.null(info(raw_vcf)$MATEID)) {
-        raw_vcf = flatten_mateid(raw_vcf)
-    }
-    return(raw_vcf)
+## GRIDSS function readVcf copied from "libgridss.R":
+readVcf <- function(file, ...) {
+  raw_vcf <- VariantAnnotation::readVcf(file = file, ...)
+  if (!all(unlist(alt(raw_vcf)) != "")) {
+    write(
+      "Performing work-around for https://github.com/Bioconductor/VariantAnnotation/issues/8",
+      stderr()
+    )
+    alt <- read_tsv(
+      file,
+      comment = "#",
+      col_names = c(
+        "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
+        "INFO", "FORMAT", seq_len(ncol(geno(raw_vcf)[[1]]))
+      ),
+      cols_only(ALT = col_character())
+    )$ALT
+    VariantAnnotation::fixed(raw_vcf)$ALT <- CharacterList(lapply(as.character(alt), function(x) x))
+  }
+  raw_vcf <- fix_parid(raw_vcf)
+  if ("MATEID" %in% names(info(raw_vcf)) & !is.null(info(raw_vcf)$MATEID)) {
+    raw_vcf <- flatten_mateid(raw_vcf)
+  }
+  return(raw_vcf)
 }
-flatten_mateid = function(vcf) {
+flatten_mateid <- function(vcf) {
   if (!("MATEID" %in% names(info(vcf)))) {
     stop("Missing MATEID")
   }
-  mateid = info(vcf)$MATEID
+  mateid <- info(vcf)$MATEID
   if (any(elementNROWS(mateid)) > 1) {
     stop("Multiple MATEID for a single record not supported.")
   }
-  info(vcf)$MATEID = as.character(mateid)
+  info(vcf)$MATEID <- as.character(mateid)
   return(vcf)
 }
-fix_parid = function(vcf) {
-  if (("PARID" %in% row.names(info(header(vcf))) & !("MATEID" %in% row.names(info(header(vcf))))) |
-  		(!is.null(info(vcf)$PARID) & any(elementNROWS(info(vcf)$PARID) > 0) & (is.null(info(vcf)$MATEID) | all(elementNROWS(info(vcf)$MATEID) == 0)))) {
-    parid = info(vcf)$PARID
-    info(vcf)$PARID = NULL
-    parid_header_ordinal = which(row.names(info(header(vcf))) == "PARID")
-    row.names(info(header(vcf)))[parid_header_ordinal] = "MATEID"
-    info(header(vcf))$Description[parid_header_ordinal] = "ID of mate breakends"
-    info(vcf)$MATEID = parid
+fix_parid <- function(vcf) {
+  if (
+    ("PARID" %in% row.names(info(header(vcf))) & !("MATEID" %in% row.names(info(header(vcf))))) |
+      (!is.null(info(vcf)$PARID) & any(elementNROWS(info(vcf)$PARID) > 0) &
+        (is.null(info(vcf)$MATEID) | all(elementNROWS(info(vcf)$MATEID) == 0)))) {
+    parid <- info(vcf)$PARID
+    info(vcf)$PARID <- NULL
+    parid_header_ordinal <- which(row.names(info(header(vcf))) == "PARID")
+    row.names(info(header(vcf)))[parid_header_ordinal] <- "MATEID"
+    info(header(vcf))$Description[parid_header_ordinal] <- "ID of mate breakends"
+    info(vcf)$MATEID <- parid
     write("WARNING: MATEID header not found. Assuming VCF was generated prior to GRIDSS 2.8.0 and rewriting PARID as MATEID.", stderr())
   }
   return(vcf)
 }
 
 #### Read input vcf ####
+somvcfname <- paste0(
+  argv$samplegroup,
+  "_high_and_low_confidence_somatic.vcf.bgz"
+)
 somvcf <- tryCatch(
-  readVcf( 
-    paste0(argv$samplegroup, "_high_and_imprecise.vcf")
-    ),
-  error = function(e){
-    stop(paste(e, "when reading GRIDSS somatic vcf",
-               paste0(argv$samplegroup, "_high_and_imprecise.vcf") )
-    )
+  readVcf(somvcfname),
+  error = function(e) {
+    stop(paste(
+      e, "when reading GRIDSS somatic vcf", somvcfname
+    ))
   }
 )
 #----------- Filtering parameters -----------------
-parentlist <- str_split(argv$parentlist, ' ') |> unlist() |>
-    str_remove("_nodup.bam$")
-samplesOI <- setdiff( rownames( colData( somvcf ) ), parentlist )
+parentlist <- str_split(argv$parentlist, " ") |>
+  unlist() |>
+  str_remove("_nodup.bam$")
+samplesOI <- setdiff(rownames(colData(somvcf)), parentlist)
 ## For replicate samples, a filtering criterion is that variants are
 ## present in critSamplesSom or more of the samples.
 ## The default value is half the number of (non-parent) samples.
 if (is.na(argv$critsamplecount)) {
   critSamplesSom <- (1 + length(samplesOI)) / 2
-} else critSamplesSom <- argv$critsamplecount
+} else {
+  critSamplesSom <- argv$critsamplecount
+}
 
-BQcrit <- 200; Qcrit <- 200
+BQcrit <- 200
+Qcrit <- 200
+acceptableFilt <- c("PASS", "imprecise")
+firstFiltname <- paste0(argv$samplegroup, "_high_and_imprecise_somatic.vcf")
 
-#### Filter by event is in 'majority' of samples and not parent ####
-majsom <- somvcf[
-    apply( geno(somvcf)$BQ, 1,
-           function( Q){
-               sum( Q[samplesOI] > BQcrit)  >= critSamplesSom
-           }
-    ) &
-        apply( geno(somvcf)$QUAL, 1,
-               function( Q){
-                   sum( Q[samplesOI] > Qcrit)  >= critSamplesSom
-               }
-        ) 
-]
-writeVcf( majsom, file.path( 
-                             paste0( argv$samplegroup, "_all_majsom.vcf") 
-                          )
+AFcrit <- 0.15
+
+#### Filter by event has FILTER values in acceptable set ####
+passvcf <- VariantAnnotation::subset(somvcf, filt(somvcf) %in% acceptableFilt)
+## include mates
+passvcf <- VariantAnnotation::subset(
+  somvcf, names(somvcf) %in% c(names(passvcf), na.omit(info(passvcf)$MATEID))
 )
+rm(somvcf)
+writeVcf(passvcf, firstFiltname)
 
-AFcrit <- 0.15
-somMinAF <- somvcf[
-    apply( geno(somvcf)$AF, 1,
-           function( af){
-               sum( af[samplesOI] |> unlist() > AFcrit)  >= critSamplesSom &
-                   max( af[samplesOI] |> unlist() ) > 2* max( unlist(af[parentlist]) )
-           }
-    ) 
+#### Filter by event is in 'majority' of samples and not parent ####
+majsom <- passvcf[
+  apply(
+    geno(passvcf)$BQ, 1,
+    function(Q) {
+      sum(Q[samplesOI] > BQcrit) >= critSamplesSom
+    }
+  ) &
+    apply(
+      geno(passvcf)$QUAL, 1,
+      function(Q) {
+        sum(Q[samplesOI] > Qcrit) >= critSamplesSom
+      }
+    )
 ]
-writeVcf( somMinAF, file.path( 
-                               paste0( argv$samplegroup, "_all_somMinAF.vcf") )
-)
 
-## Apply both QUAL and AF filters
-somBothFilt <- majsom[
-    apply( geno(majsom)$AF, 1,
-           function( af){
-               sum( af[samplesOI] %>% unlist > AFcrit)  >= critSamplesSom &
-                   max( af[samplesOI] %>% unlist) > 2* max( unlist(af[parentlist]) )
-           }
-    ) 
+somMinAF <- passvcf[
+  apply(
+    geno(passvcf)$AF, 1,
+    function(af) {
+      sum(af[samplesOI] |> unlist() > AFcrit) >= critSamplesSom &
+        max(af[samplesOI] |> unlist()) > 2 * max(unlist(af[parentlist]))
+    }
+  )
 ]
 
+#### Save the union of the 2 filters as vcf and summary table ####
+somEitherFilt <- rbind(
+    majsom, somMinAF
+) |> unique() |> sort()
+writeVcf(somEitherFilt, 
+         paste0(argv$samplegroup, "_somatic_by_QUALorAF.vcf"))
+
 filtdf <- data.frame(
-    chrom = seqnames(somBothFilt),
-    start = start(somBothFilt), 
-    end = end(somBothFilt),
-    as.data.frame(geno(somBothFilt)$AF)|> unnest(cols = everything()) |>
-      rename_with(~ paste0("AF_", .x)),
-    as.data.frame(geno(somBothFilt)$QUAL) |> rename_with(~ paste0("QUAL_", .x))
-    )
-write_tsv(filtdf,
-            file.path( paste0( argv$samplegroup, "_bothfilters.tsv") ))
+  gridssID = rownames(somEitherFilt),
+  chrom = seqnames(somEitherFilt),
+  pos = start(somEitherFilt),
+  REF = ref(somEitherFilt) |> unlist(),
+  ALT = alt(somEitherFilt) |> unlist() |> str_trunc(width = 24, side = "right"),
+  as.data.frame(geno(somEitherFilt)$AF) |> unnest(cols = everything()) |>
+    rename_with(~ paste0("AF_", .x)),
+  as.data.frame(geno(somEitherFilt)$QUAL) |> rename_with(~ paste0("QUAL_", .x))
+) |>
+    arrange(gridssID)
+write_tsv(
+  filtdf,
+  file.path(paste0(argv$samplegroup, "_somatic_by_QUALorAF.tsv"))
+)
diff --git a/config/modules.config b/config/modules.config
@@ -24,7 +24,7 @@ process {
     withLabel:IndexDedup {
         queue='regular'
         cpus = 5
-        memory={ 10.GB * task.attempt }
+        memory={ 32.GB * task.attempt }
         time='8h'
         errorStrategy ={ 'retry'  }
         maxRetries= 5

diff --git a/main.nf b/main.nf
@@ -23,7 +23,7 @@ include { SomaticFilter } from './modules/stvariant.nf'
 include { RCopyNum } from './modules/stvariant.nf'
 include { InstallR } from './modules/stvariant.nf'
 include { FilterBcfVcf } from './modules/stvariant.nf'
-include { MajorityFilter } from './modules/stvariant.nf'
+include { FilterGridssSV } from './modules/stvariant.nf'
 include { RPlotFull } from './modules/stvariant.nf'
 include { RPlotROI } from './modules/stvariant.nf'
 
@@ -205,14 +205,14 @@ workflow {
                             // path(vcf), 
                             // path(bams), path(bai), val(dummy)
     FilterBcfVcf(fbcf_ch )            
-    //----------------------Majority filter----------------------------------- 
+    //----------------------GRIDSS filter----------------------------------- 
     input_ch.map{row -> tuple(row[0],row[4])}
             .unique()
             .combine(parent_ch.map{row->tuple(row[2],row[0])},by:1)
             .map{row->tuple(row[1],row[2])}
             .join(sfilter_ch.vcf)
-            .set{mjf_ch} // Emits val(groupId),val(parentlist), path(vcf), val(dummy)
-    MajorityFilter(mjf_ch)
+            .set{gf_ch} // Emits val(groupId),val(parentlist), path(vcf), val(dummy)
+    FilterGridssSV(gf_ch)
     //----------------------Plot-----------------------------------------
     // Input Channel Emits val(groupId),val(parentId),val(refpath),val(prefix),path(rds)
     input_ch.map{row -> tuple(row[0],row[4])}

diff --git a/modules/stvariant.nf b/modules/stvariant.nf
@@ -78,7 +78,7 @@ process SomaticFilter{
             path(vcf)
 
     output:
-    tuple val(groupId),path("${groupId}_high_and_imprecise.vcf"), emit:vcf
+    tuple val(groupId),path("${groupId}_high_and_low_confidence_somatic.vcf.bgz"), emit:vcf
     path("output.txt")
 
     script:
@@ -89,21 +89,13 @@ process SomaticFilter{
     samplecount=\$(wc -l < ${groupId}_bams.txt)
     tumourordinals=\$(seq -s \' \' \$(expr \$parentcount + 1) \$samplecount)
    
-    echo "Start Somatic filter."
     Rscript --vanilla ${projectDir}/Rtools/gridss_assets/gridss_somatic_filter.R \
         --input ${groupId}.vcf \
-        --fulloutput ${groupId}_high_and_low_confidence_somatic.vcf.bgz \
+        --fulloutput ${groupId}_high_and_low_confidence_somatic.vcf \
         --scriptdir ${projectDir}/Rtools/gridss_assets/  ##\$(dirname \$(which gridss_somatic_filter))\
         --ref ${bsref}  \
         --normalordinal 1  --tumourordinal \$tumourordinals
-    
-    gzip -dc ${groupId}_high_and_low_confidence_somatic.vcf.bgz.bgz | awk  \
-    \'/^#/ || \$7 ~ /^PASS\$/ || \$7 ~ /^imprecise\$/' >  \
-    ${groupId}_high_and_imprecise.vcf
-
-    echo "GRIDSS somatic filter used 1st line of ${groupId}_bams.txt as Normal sample" > output.txt
-    echo "and lines \$tumourordinals as 'tumour' samples." >> output.txt
-    
+        
     """
 }
 process RCopyNum {
@@ -165,7 +157,7 @@ process FilterBcfVcf {
     """
 }
 
-process MajorityFilter {
+process FilterGridssSV {
     label 'Rfilter'    
     tag "${groupId}"   
     publishDir "${params.outdir}/variants/SVs", mode: 'copy'
@@ -175,7 +167,7 @@ process MajorityFilter {
     tuple  val(groupId),val(parentbamlist), path(vcf)
 
     output:
-    tuple val(groupId), path("${groupId}_all_*.vcf"), emit: vcf
+    tuple val(groupId), path("${groupId}_*.vcf"), emit: vcf
     tuple val(groupId), path("${groupId}*.tsv"), emit: txt 
     script:
     """

diff --git a/nextflow.config b/nextflow.config
@@ -52,3 +52,6 @@ profiles {
 
 // Load modules.config for DSL2 module specific options
 includeConfig 'config/modules.config'
+
+cache = 'lenient'
+
Original file line number	Diff line number	Diff line change
Expand Up		@@ -52,3 +52,6 @@ profiles {

		// Load modules.config for DSL2 module specific options
		includeConfig 'config/modules.config'

		cache = 'lenient'