From 79af1d0ca34347a6013228628b6c6a450ae18850 Mon Sep 17 00:00:00 2001
From: Sebastian Schoenherr <sebastian.schoenherr@i-med.ac.at>
Date: Mon, 26 Feb 2024 16:09:16 +0100
Subject: [PATCH] Improve mode to combine MUTECT2 and mutserve

---
 modules/local/filter_variants.nf    | 27 ++++++++++++++++++++
 modules/local/merging_variants.nf   | 20 +++++++++++++++
 modules/local/mutect2.nf            |  7 +++---
 modules/local/mutserve.nf           | 27 ++++++++++++--------
 modules/local/mutserve_single.nf    | 38 -----------------------------
 modules/local/summarize_variants.nf | 20 ---------------
 reports/report.Rmd                  | 11 ++++++++-
 tests/test_mitohpc.config           |  5 ++--
 workflows/mtdna_server_2.nf         | 35 ++++++++++++++------------
 9 files changed, 100 insertions(+), 90 deletions(-)
 create mode 100644 modules/local/filter_variants.nf
 create mode 100644 modules/local/merging_variants.nf
 delete mode 100644 modules/local/mutserve_single.nf
 delete mode 100644 modules/local/summarize_variants.nf

diff --git a/modules/local/filter_variants.nf b/modules/local/filter_variants.nf
new file mode 100644
index 0000000..114ae27
--- /dev/null
+++ b/modules/local/filter_variants.nf
@@ -0,0 +1,27 @@
+process FILTER_VARIANTS {
+
+    //publishDir "${params.output}", mode: 'copy'
+
+    input:
+    tuple path(vcf_file), val(method)
+
+    output:
+    path("${vcf_file.baseName}.${method}.filtered.txt"), emit: combined_methods_ch
+
+    script:
+    def vcf_name = "${vcf_file}".replaceAll('.vcf.gz', '')
+
+    """
+    echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${vcf_file.baseName}.${method}.txt
+    
+    if [[ ${method} == "mutserve" ]]
+    then
+        bcftools query -f '${vcf_name}.bam\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t%GT]\n' ${vcf_file} >> ${vcf_file.baseName}.${method}.txt
+        awk -F'\t' 'NR == 1 || (length(\$4) == 1 && length(\$5) == 1)' ${vcf_file.baseName}.mutserve.txt > ${vcf_file.baseName}.${method}.filtered.txt
+    else
+        bcftools query -f '${vcf_name}.bam\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\tINDEL\n'  ${vcf_file} >> ${vcf_file.baseName}.${method}.txt
+        awk -F'\t' 'NR == 1 || ((length(\$4) > 1 || length(\$5) > 1) && length(\$4) != length(\$5))' ${vcf_file.baseName}.${method}.txt > ${vcf_file.baseName}.${method}.filtered.txt
+    fi
+    """
+}
+
diff --git a/modules/local/merging_variants.nf b/modules/local/merging_variants.nf
new file mode 100644
index 0000000..e0f3a43
--- /dev/null
+++ b/modules/local/merging_variants.nf
@@ -0,0 +1,20 @@
+process MERGING_VARIANTS {
+
+    //publishDir "${params.output}", mode: 'copy'
+
+    input:
+    path variants_txt
+
+    output:
+    path("variants.merged.txt"), emit: txt_summarized_ch
+
+    """
+    csvtk concat -t ${variants_txt} -T -o variants.concat.txt
+    
+    csvtk sort -t variants.concat.txt -k ID:N -k Pos:n -k Type:r -T -o variants.sorted.txt
+
+    java -jar /opt/VariantMerger.jar \
+        variants.sorted.txt \
+        --output variants.merged.txt
+    """
+}
\ No newline at end of file
diff --git a/modules/local/mutect2.nf b/modules/local/mutect2.nf
index ffa83c2..56d44a3 100644
--- a/modules/local/mutect2.nf
+++ b/modules/local/mutect2.nf
@@ -1,6 +1,6 @@
 process MUTECT2 {
 
-    publishDir "${params.output}", mode: 'copy'
+    //publishDir "${params.output}", mode: 'copy', pattern: 'vcf.gz'
 
     input:
     path bam_file
@@ -10,7 +10,7 @@ process MUTECT2 {
     val detected_contig
 
     output:
-    path("${bam_file.baseName}.vcf.gz"), emit: vcf_ch
+    tuple path("${bam_file.baseName}.vcf.gz"), val('mutect2'), emit: mutect2_vcf_ch
     path("${bam_file.baseName}.mutect2.txt"), emit: txt_ch
     path("${bam_file.baseName}.mutect2.filtered.txt"), emit: combined_results
 
@@ -34,10 +34,11 @@ process MUTECT2 {
     rm raw.vcf.gz
 
     bcftools norm -m-any -f ${reference} ${bam_file.baseName}.vcf.gz -o ${bam_file.baseName}.norm.vcf.gz -Oz
+    mv ${bam_file.baseName}.norm.vcf.gz ${bam_file.baseName}.vcf.gz
 
     echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutect2.txt
 
-    bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\tINDEL\n'  ${bam_file.baseName}.norm.vcf.gz >> ${bam_file.baseName}.mutect2.txt
+    bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\tINDEL\n'  ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutect2.txt
 
     awk -F'\t' 'NR == 1 || ((length(\$4) > 1 || length(\$5) > 1) && length(\$4) != length(\$5))' ${bam_file.baseName}.mutect2.txt > ${bam_file.baseName}.mutect2.filtered.txt
 
diff --git a/modules/local/mutserve.nf b/modules/local/mutserve.nf
index fa8a048..df98815 100644
--- a/modules/local/mutserve.nf
+++ b/modules/local/mutserve.nf
@@ -1,20 +1,20 @@
 process MUTSERVE {
 
-    publishDir "${params.output}", mode: 'copy',pattern: '*.vcf.gz'
+    //publishDir "${params.output}", mode: 'copy'
 
     input:
-    path bam_files_ch
+    path bam_file
     path reference
     path excluded_samples
 
     output:
-    path("variants.vcf.gz"), emit: vcf_ch
-    path("variants.txt"), emit: txt_ch
-
+    path("${bam_file.baseName}.mutserve.filtered.txt"), emit: combined_results
+    tuple path("${bam_file.baseName}.vcf.gz"), val('mutserve'), emit: mutserve_vcf_ch
+    tuple path("${bam_file.baseName}.vcf.gz"), path("${bam_file.baseName}.vcf.gz.tbi"), emit: mutserve_vcf_only_ch
     script:
     def avail_mem = 1024
     if (!task.memory) {
-        log.info '[MUTSERVE] Available memory not known - defaulting to 1GB. Specify process memory requirements to change this.'
+        //log.info '[MUTSERVE] Available memory not known - defaulting to 1GB. Specify process memory requirements to change this.'
     } else {
         avail_mem = (task.memory.mega*0.8).intValue()
     }    
@@ -25,10 +25,17 @@ process MUTSERVE {
     --reference ${reference} \
     --mapQ ${params.mapQ} \
     --baseQ ${params.baseQ} \
-    --deletions --output variants.vcf.gz \
-    --no-ansi ${bam_files_ch} \
+    --output ${bam_file.baseName}.vcf.gz \
+    --no-ansi \
     --excluded-samples ${excluded_samples} \
-    --threads ${task.cpus} \
-    --write-raw
+    --write-raw \
+    ${bam_file} 
+
+    tabix ${bam_file.baseName}.vcf.gz
+
+    echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutserve.txt
+    bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t]SNV\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutserve.txt
+    awk -F'\t' 'NR == 1 || (length(\$4) == 1 && length(\$5) == 1)' ${bam_file.baseName}.mutserve.txt > ${bam_file.baseName}.mutserve.filtered.txt
+   
     """
 }
\ No newline at end of file
diff --git a/modules/local/mutserve_single.nf b/modules/local/mutserve_single.nf
deleted file mode 100644
index 0eef520..0000000
--- a/modules/local/mutserve_single.nf
+++ /dev/null
@@ -1,38 +0,0 @@
-process MUTSERVE_SINGLE {
-
-    //publishDir "${params.output}", mode: 'copy'
-
-    input:
-    path bam_file
-    path reference
-    path excluded_samples
-
-    output:
-    path("${bam_file.baseName}.mutserve.filtered.txt"), emit: combined_results
-
-    script:
-    def avail_mem = 1024
-    if (!task.memory) {
-        //log.info '[MUTSERVE] Available memory not known - defaulting to 1GB. Specify process memory requirements to change this.'
-    } else {
-        avail_mem = (task.memory.mega*0.8).intValue()
-    }    
-
-    """
-    java -Xmx${avail_mem}M -jar /opt/mutserve/mutserve.jar call \
-    --level ${params.detection_limit} \
-    --reference ${reference} \
-    --mapQ ${params.mapQ} \
-    --baseQ ${params.baseQ} \
-    --output ${bam_file.baseName}.vcf.gz \
-    --no-ansi \
-    --excluded-samples ${excluded_samples} \
-    --write-raw \
-    ${bam_file} 
-
-    echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutserve.txt
-    bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t]SNV\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutserve.txt
-    awk -F'\t' 'NR == 1 || (length(\$4) == 1 && length(\$5) == 1)' ${bam_file.baseName}.mutserve.txt > ${bam_file.baseName}.mutserve.filtered.txt
-   
-    """
-}
\ No newline at end of file
diff --git a/modules/local/summarize_variants.nf b/modules/local/summarize_variants.nf
deleted file mode 100644
index 308c339..0000000
--- a/modules/local/summarize_variants.nf
+++ /dev/null
@@ -1,20 +0,0 @@
-process SUMMARIZE_VARIANTS {
-
-    publishDir "${params.output}", mode: 'copy'
-
-    input:
-    path variants_txt
-
-    output:
-    path("variants.fixed.txt"), emit: txt_summarized_ch
-
-    """
-    csvtk concat -t ${variants_txt} -T -o variants.concat.txt
-    
-    csvtk sort -t variants.concat.txt -k ID:N -k Pos:n -k Type -T -o variants.sorted.txt
-
-    java -jar /opt/VariantMerger.jar \
-        variants.sorted.txt \
-        --output variants.fixed.txt
-    """
-}
\ No newline at end of file
diff --git a/reports/report.Rmd b/reports/report.Rmd
index b55b532..349262d 100644
--- a/reports/report.Rmd
+++ b/reports/report.Rmd
@@ -230,12 +230,21 @@ Row
 
 
 ```{r echo=FALSE, results='asis'}
+variants <- variants %>%
+  mutate(Type = case_when(
+    Type == "0/1" ~ "2",
+    Type == "1/0" ~ "2",
+    Type == "INDEL" ~ "3",
+    TRUE ~ as.character(Type)
+  ))
+
 summary_data <- aggregate(  Sample_Label ~ Pos + Type , data = variants, FUN = length)
-summary_data <- summary_data %>% filter(Sample_Label >= 2)
+summary_data <- summary_data %>% filter(Sample_Label >= 1)
 summary_data <- summary_data %>% mutate(
   label = case_when(
     Type == 1 ~ "Variant",
     Type == 2 ~ "Heteroplasmy",
+    Type == 3 ~ "INDEL",
     TRUE ~ "Unknown"
   )
 )
diff --git a/tests/test_mitohpc.config b/tests/test_mitohpc.config
index 330a6a5..73d4814 100644
--- a/tests/test_mitohpc.config
+++ b/tests/test_mitohpc.config
@@ -10,8 +10,9 @@
 
 params {
     project         = "test-job"
-    files           = "/home/seb/Desktop/bams/*L2*"
+    files           = "/home/seb/Desktop/bams/*"
     output          = "out"
-    mode = "combined"
+    detection_limit = 0.03    
+    mode = "fusion"
 }
 
diff --git a/workflows/mtdna_server_2.nf b/workflows/mtdna_server_2.nf
index 08e7719..30f662d 100644
--- a/workflows/mtdna_server_2.nf
+++ b/workflows/mtdna_server_2.nf
@@ -21,13 +21,12 @@ include { INPUT_VALIDATION } from '../modules/local/input_validation'
 include { QUALITY_CONTROL } from '../modules/local/quality_control'
 include { MUTSERVE } from '../modules/local/mutserve'
 include { MUTECT2 } from '../modules/local/mutect2'
-include { SUMMARIZE_VARIANTS } from '../modules/local/summarize_variants'
+include { FILTER_VARIANTS } from '../modules/local/filter_variants'
+include { MERGING_VARIANTS } from '../modules/local/merging_variants'
 include { ANNOTATE } from '../modules/local/annotate'
 include { HAPLOGROUP_DETECTION } from '../modules/local/haplogroup_detection'
 include { CONTAMINATION_DETECTION } from '../modules/local/contamination_detection'
 include { REPORT } from '../modules/local/report'
-include { MUTSERVE_SINGLE } from '../modules/local/mutserve_single'
-
 
 
 workflow MTDNA_SERVER_2 {
@@ -76,11 +75,11 @@ workflow MTDNA_SERVER_2 {
         detected_contig
         )
 
-        SUMMARIZE_VARIANTS(
+        MERGING_VARIANTS(
         MUTECT2.out.txt_ch.collect()
         )
 
-        variants_txt_ch = SUMMARIZE_VARIANTS.out.txt_summarized_ch
+        variants_txt_ch = MERGING_VARIANTS.out.txt_summarized_ch
     }
 
     else if (params.mode == 'mutserve') {
@@ -106,29 +105,33 @@ workflow MTDNA_SERVER_2 {
          contamination_ch =  CONTAMINATION_DETECTION.out.contamination_txt_ch
     } 
 
-       else if (params.mode == 'combined') {
+    else if (params.mode == 'fusion') {
 
-        MUTSERVE_SINGLE(
+        MUTSERVE(
             bams_ch,
             ref_file_mutserve,
             INPUT_VALIDATION.out.excluded_ch
         )
 
         MUTECT2(
-        bams_ch,
-        ref_file_mutect2,
-        INPUT_VALIDATION.out.excluded_ch,
-        INDEX.out.fasta_index_ch,
-        detected_contig
+            bams_ch,
+            ref_file_mutect2,
+            INPUT_VALIDATION.out.excluded_ch,
+            INDEX.out.fasta_index_ch,
+            detected_contig
         )
         
-        merged_ch = MUTECT2.out.combined_results.collect().combine(MUTSERVE_SINGLE.out.combined_results.collect())
+        merged_ch = MUTSERVE.out.mutserve_vcf_ch.concat(MUTECT2.out.mutect2_vcf_ch)
+
+        FILTER_VARIANTS (
+            merged_ch
+        )
 
-        SUMMARIZE_VARIANTS(
-        merged_ch
+        MERGING_VARIANTS(
+            FILTER_VARIANTS.out.combined_methods_ch.collect()
         )
 
-        variants_txt_ch = SUMMARIZE_VARIANTS.out.txt_summarized_ch
+        variants_txt_ch = MERGING_VARIANTS.out.txt_summarized_ch
 
     }