Skip to content

Commit

Permalink
Merge branch 'GRIDSSfilterchange'
Browse files Browse the repository at this point in the history
  • Loading branch information
JocelynSP committed Jun 26, 2024
2 parents d755212 + 73afcee commit 7c9f6ad
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 113 deletions.
216 changes: 121 additions & 95 deletions Rtools/malDrugR/gridss_majorityfilt.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## Read gridss vcf that has been filtered with gridss_somatic_filter.R
## and filter by requiring that a majority of resistant samples pass quality
## and filter by requiring that a majority of resistant samples pass quality
## criteria.
## Alternative filtering requires a majority of resistant samples to have AF
## greater than some critical value, and the maximum AF among resistant samples
Expand All @@ -17,139 +17,165 @@ argp <- arg_parser(paste(
"Read an SV vcf filtered to 'somatic' events, and filter again."
))
argp <- add_argument(argp, "--samplegroup",
help = "Group name of related samples. Required"
help = "Group name of related samples. Required"
)
argp <- add_argument(argp, "--scriptdir",
help = "Path to libgridss.R script"
)
argp = add_argument(argp, "--scriptdir",
help="Path to libgridss.R script")

argp <- add_argument(argp, "--parentlist",
help = "parent filenames in a space separated string"
help = "parent filenames in a space separated string"
)
argp <- add_argument(argp, "--critsamplecount",
type = "double",
help = paste("Minimum number of (non-parent) samples",
"having an SV. Default is (#samples + 1)/2")
type = "double",
help = paste(
"Minimum number of (non-parent) samples",
"having an SV. Default is (#samples + 1)/2"
)
)
argv <- parse_args(argp)


## GRIDSS function readVcf copied from "libgridss.R":
readVcf = function(file, ...) {
raw_vcf = VariantAnnotation::readVcf(file=file, ...)
if (!all(unlist(alt(raw_vcf)) != "")) {
write(
"Performing work-around for https://github.com/Bioconductor/VariantAnnotation/issues/8",
stderr())
alt = read_tsv(
file, comment="#",
col_names=c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
"INFO", "FORMAT", seq_len(ncol(geno(raw_vcf)[[1]]))),
cols_only(ALT=col_character()))$ALT
VariantAnnotation::fixed(raw_vcf)$ALT = CharacterList(lapply(as.character(alt), function(x) x))
}
raw_vcf = fix_parid(raw_vcf)
if ("MATEID" %in% names(info(raw_vcf)) & !is.null(info(raw_vcf)$MATEID)) {
raw_vcf = flatten_mateid(raw_vcf)
}
return(raw_vcf)
## GRIDSS function readVcf copied from "libgridss.R":
readVcf <- function(file, ...) {
raw_vcf <- VariantAnnotation::readVcf(file = file, ...)
if (!all(unlist(alt(raw_vcf)) != "")) {
write(
"Performing work-around for https://github.com/Bioconductor/VariantAnnotation/issues/8",
stderr()
)
alt <- read_tsv(
file,
comment = "#",
col_names = c(
"CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
"INFO", "FORMAT", seq_len(ncol(geno(raw_vcf)[[1]]))
),
cols_only(ALT = col_character())
)$ALT
VariantAnnotation::fixed(raw_vcf)$ALT <- CharacterList(lapply(as.character(alt), function(x) x))
}
raw_vcf <- fix_parid(raw_vcf)
if ("MATEID" %in% names(info(raw_vcf)) & !is.null(info(raw_vcf)$MATEID)) {
raw_vcf <- flatten_mateid(raw_vcf)
}
return(raw_vcf)
}
flatten_mateid = function(vcf) {
flatten_mateid <- function(vcf) {
if (!("MATEID" %in% names(info(vcf)))) {
stop("Missing MATEID")
}
mateid = info(vcf)$MATEID
mateid <- info(vcf)$MATEID
if (any(elementNROWS(mateid)) > 1) {
stop("Multiple MATEID for a single record not supported.")
}
info(vcf)$MATEID = as.character(mateid)
info(vcf)$MATEID <- as.character(mateid)
return(vcf)
}
fix_parid = function(vcf) {
if (("PARID" %in% row.names(info(header(vcf))) & !("MATEID" %in% row.names(info(header(vcf))))) |
(!is.null(info(vcf)$PARID) & any(elementNROWS(info(vcf)$PARID) > 0) & (is.null(info(vcf)$MATEID) | all(elementNROWS(info(vcf)$MATEID) == 0)))) {
parid = info(vcf)$PARID
info(vcf)$PARID = NULL
parid_header_ordinal = which(row.names(info(header(vcf))) == "PARID")
row.names(info(header(vcf)))[parid_header_ordinal] = "MATEID"
info(header(vcf))$Description[parid_header_ordinal] = "ID of mate breakends"
info(vcf)$MATEID = parid
fix_parid <- function(vcf) {
if (
("PARID" %in% row.names(info(header(vcf))) & !("MATEID" %in% row.names(info(header(vcf))))) |
(!is.null(info(vcf)$PARID) & any(elementNROWS(info(vcf)$PARID) > 0) &
(is.null(info(vcf)$MATEID) | all(elementNROWS(info(vcf)$MATEID) == 0)))) {
parid <- info(vcf)$PARID
info(vcf)$PARID <- NULL
parid_header_ordinal <- which(row.names(info(header(vcf))) == "PARID")
row.names(info(header(vcf)))[parid_header_ordinal] <- "MATEID"
info(header(vcf))$Description[parid_header_ordinal] <- "ID of mate breakends"
info(vcf)$MATEID <- parid
write("WARNING: MATEID header not found. Assuming VCF was generated prior to GRIDSS 2.8.0 and rewriting PARID as MATEID.", stderr())
}
return(vcf)
}

#### Read input vcf ####
somvcfname <- paste0(
argv$samplegroup,
"_high_and_low_confidence_somatic.vcf.bgz"
)
somvcf <- tryCatch(
readVcf(
paste0(argv$samplegroup, "_high_and_imprecise.vcf")
),
error = function(e){
stop(paste(e, "when reading GRIDSS somatic vcf",
paste0(argv$samplegroup, "_high_and_imprecise.vcf") )
)
readVcf(somvcfname),
error = function(e) {
stop(paste(
e, "when reading GRIDSS somatic vcf", somvcfname
))
}
)
#----------- Filtering parameters -----------------
parentlist <- str_split(argv$parentlist, ' ') |> unlist() |>
str_remove("_nodup.bam$")
samplesOI <- setdiff( rownames( colData( somvcf ) ), parentlist )
parentlist <- str_split(argv$parentlist, " ") |>
unlist() |>
str_remove("_nodup.bam$")
samplesOI <- setdiff(rownames(colData(somvcf)), parentlist)
## For replicate samples, a filtering criterion is that variants are
## present in critSamplesSom or more of the samples.
## The default value is half the number of (non-parent) samples.
if (is.na(argv$critsamplecount)) {
critSamplesSom <- (1 + length(samplesOI)) / 2
} else critSamplesSom <- argv$critsamplecount
} else {
critSamplesSom <- argv$critsamplecount
}

BQcrit <- 200; Qcrit <- 200
BQcrit <- 200
Qcrit <- 200
acceptableFilt <- c("PASS", "imprecise")
firstFiltname <- paste0(argv$samplegroup, "_high_and_imprecise_somatic.vcf")

#### Filter by event is in 'majority' of samples and not parent ####
majsom <- somvcf[
apply( geno(somvcf)$BQ, 1,
function( Q){
sum( Q[samplesOI] > BQcrit) >= critSamplesSom
}
) &
apply( geno(somvcf)$QUAL, 1,
function( Q){
sum( Q[samplesOI] > Qcrit) >= critSamplesSom
}
)
]
writeVcf( majsom, file.path(
paste0( argv$samplegroup, "_all_majsom.vcf")
)
AFcrit <- 0.15

#### Filter by event has FILTER values in acceptable set ####
passvcf <- VariantAnnotation::subset(somvcf, filt(somvcf) %in% acceptableFilt)
## include mates
passvcf <- VariantAnnotation::subset(
somvcf, names(somvcf) %in% c(names(passvcf), na.omit(info(passvcf)$MATEID))
)
rm(somvcf)
writeVcf(passvcf, firstFiltname)

AFcrit <- 0.15
somMinAF <- somvcf[
apply( geno(somvcf)$AF, 1,
function( af){
sum( af[samplesOI] |> unlist() > AFcrit) >= critSamplesSom &
max( af[samplesOI] |> unlist() ) > 2* max( unlist(af[parentlist]) )
}
)
#### Filter by event is in 'majority' of samples and not parent ####
majsom <- passvcf[
apply(
geno(passvcf)$BQ, 1,
function(Q) {
sum(Q[samplesOI] > BQcrit) >= critSamplesSom
}
) &
apply(
geno(passvcf)$QUAL, 1,
function(Q) {
sum(Q[samplesOI] > Qcrit) >= critSamplesSom
}
)
]
writeVcf( somMinAF, file.path(
paste0( argv$samplegroup, "_all_somMinAF.vcf") )
)

## Apply both QUAL and AF filters
somBothFilt <- majsom[
apply( geno(majsom)$AF, 1,
function( af){
sum( af[samplesOI] %>% unlist > AFcrit) >= critSamplesSom &
max( af[samplesOI] %>% unlist) > 2* max( unlist(af[parentlist]) )
}
)
somMinAF <- passvcf[
apply(
geno(passvcf)$AF, 1,
function(af) {
sum(af[samplesOI] |> unlist() > AFcrit) >= critSamplesSom &
max(af[samplesOI] |> unlist()) > 2 * max(unlist(af[parentlist]))
}
)
]

#### Save the union of the 2 filters as vcf and summary table ####
somEitherFilt <- rbind(
majsom, somMinAF
) |> unique() |> sort()
writeVcf(somEitherFilt,
paste0(argv$samplegroup, "_somatic_by_QUALorAF.vcf"))

filtdf <- data.frame(
chrom = seqnames(somBothFilt),
start = start(somBothFilt),
end = end(somBothFilt),
as.data.frame(geno(somBothFilt)$AF)|> unnest(cols = everything()) |>
rename_with(~ paste0("AF_", .x)),
as.data.frame(geno(somBothFilt)$QUAL) |> rename_with(~ paste0("QUAL_", .x))
)
write_tsv(filtdf,
file.path( paste0( argv$samplegroup, "_bothfilters.tsv") ))
gridssID = rownames(somEitherFilt),
chrom = seqnames(somEitherFilt),
pos = start(somEitherFilt),
REF = ref(somEitherFilt) |> unlist(),
ALT = alt(somEitherFilt) |> unlist() |> str_trunc(width = 24, side = "right"),
as.data.frame(geno(somEitherFilt)$AF) |> unnest(cols = everything()) |>
rename_with(~ paste0("AF_", .x)),
as.data.frame(geno(somEitherFilt)$QUAL) |> rename_with(~ paste0("QUAL_", .x))
) |>
arrange(gridssID)
write_tsv(
filtdf,
file.path(paste0(argv$samplegroup, "_somatic_by_QUALorAF.tsv"))
)
2 changes: 1 addition & 1 deletion config/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ process {
withLabel:IndexDedup {
queue='regular'
cpus = 5
memory={ 10.GB * task.attempt }
memory={ 32.GB * task.attempt }
time='8h'
errorStrategy ={ 'retry' }
maxRetries= 5
Expand Down
8 changes: 4 additions & 4 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ include { SomaticFilter } from './modules/stvariant.nf'
include { RCopyNum } from './modules/stvariant.nf'
include { InstallR } from './modules/stvariant.nf'
include { FilterBcfVcf } from './modules/stvariant.nf'
include { MajorityFilter } from './modules/stvariant.nf'
include { FilterGridssSV } from './modules/stvariant.nf'
include { RPlotFull } from './modules/stvariant.nf'
include { RPlotROI } from './modules/stvariant.nf'

Expand Down Expand Up @@ -205,14 +205,14 @@ workflow {
// path(vcf),
// path(bams), path(bai), val(dummy)
FilterBcfVcf(fbcf_ch )
//----------------------Majority filter-----------------------------------
//----------------------GRIDSS filter-----------------------------------
input_ch.map{row -> tuple(row[0],row[4])}
.unique()
.combine(parent_ch.map{row->tuple(row[2],row[0])},by:1)
.map{row->tuple(row[1],row[2])}
.join(sfilter_ch.vcf)
.set{mjf_ch} // Emits val(groupId),val(parentlist), path(vcf), val(dummy)
MajorityFilter(mjf_ch)
.set{gf_ch} // Emits val(groupId),val(parentlist), path(vcf), val(dummy)
FilterGridssSV(gf_ch)
//----------------------Plot-----------------------------------------
// Input Channel Emits val(groupId),val(parentId),val(refpath),val(prefix),path(rds)
input_ch.map{row -> tuple(row[0],row[4])}
Expand Down
18 changes: 5 additions & 13 deletions modules/stvariant.nf
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ process SomaticFilter{
path(vcf)

output:
tuple val(groupId),path("${groupId}_high_and_imprecise.vcf"), emit:vcf
tuple val(groupId),path("${groupId}_high_and_low_confidence_somatic.vcf.bgz"), emit:vcf
path("output.txt")

script:
Expand All @@ -89,21 +89,13 @@ process SomaticFilter{
samplecount=\$(wc -l < ${groupId}_bams.txt)
tumourordinals=\$(seq -s \' \' \$(expr \$parentcount + 1) \$samplecount)
echo "Start Somatic filter."
Rscript --vanilla ${projectDir}/Rtools/gridss_assets/gridss_somatic_filter.R \
--input ${groupId}.vcf \
--fulloutput ${groupId}_high_and_low_confidence_somatic.vcf.bgz \
--fulloutput ${groupId}_high_and_low_confidence_somatic.vcf \
--scriptdir ${projectDir}/Rtools/gridss_assets/ ##\$(dirname \$(which gridss_somatic_filter))\
--ref ${bsref} \
--normalordinal 1 --tumourordinal \$tumourordinals
gzip -dc ${groupId}_high_and_low_confidence_somatic.vcf.bgz.bgz | awk \
\'/^#/ || \$7 ~ /^PASS\$/ || \$7 ~ /^imprecise\$/' > \
${groupId}_high_and_imprecise.vcf
echo "GRIDSS somatic filter used 1st line of ${groupId}_bams.txt as Normal sample" > output.txt
echo "and lines \$tumourordinals as 'tumour' samples." >> output.txt
"""
}
process RCopyNum {
Expand Down Expand Up @@ -165,7 +157,7 @@ process FilterBcfVcf {
"""
}

process MajorityFilter {
process FilterGridssSV {
label 'Rfilter'
tag "${groupId}"
publishDir "${params.outdir}/variants/SVs", mode: 'copy'
Expand All @@ -175,7 +167,7 @@ process MajorityFilter {
tuple val(groupId),val(parentbamlist), path(vcf)

output:
tuple val(groupId), path("${groupId}_all_*.vcf"), emit: vcf
tuple val(groupId), path("${groupId}_*.vcf"), emit: vcf
tuple val(groupId), path("${groupId}*.tsv"), emit: txt
script:
"""
Expand Down
3 changes: 3 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,6 @@ profiles {

// Load modules.config for DSL2 module specific options
includeConfig 'config/modules.config'

cache = 'lenient'

0 comments on commit 7c9f6ad

Please sign in to comment.