From 79af1d0ca34347a6013228628b6c6a450ae18850 Mon Sep 17 00:00:00 2001 From: Sebastian Schoenherr Date: Mon, 26 Feb 2024 16:09:16 +0100 Subject: [PATCH] Improve mode to combine MUTECT2 and mutserve --- modules/local/filter_variants.nf | 27 ++++++++++++++++++++ modules/local/merging_variants.nf | 20 +++++++++++++++ modules/local/mutect2.nf | 7 +++--- modules/local/mutserve.nf | 27 ++++++++++++-------- modules/local/mutserve_single.nf | 38 ----------------------------- modules/local/summarize_variants.nf | 20 --------------- reports/report.Rmd | 11 ++++++++- tests/test_mitohpc.config | 5 ++-- workflows/mtdna_server_2.nf | 35 ++++++++++++++------------ 9 files changed, 100 insertions(+), 90 deletions(-) create mode 100644 modules/local/filter_variants.nf create mode 100644 modules/local/merging_variants.nf delete mode 100644 modules/local/mutserve_single.nf delete mode 100644 modules/local/summarize_variants.nf diff --git a/modules/local/filter_variants.nf b/modules/local/filter_variants.nf new file mode 100644 index 0000000..114ae27 --- /dev/null +++ b/modules/local/filter_variants.nf @@ -0,0 +1,27 @@ +process FILTER_VARIANTS { + + //publishDir "${params.output}", mode: 'copy' + + input: + tuple path(vcf_file), val(method) + + output: + path("${vcf_file.baseName}.${method}.filtered.txt"), emit: combined_methods_ch + + script: + def vcf_name = "${vcf_file}".replaceAll('.vcf.gz', '') + + """ + echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${vcf_file.baseName}.${method}.txt + + if [[ ${method} == "mutserve" ]] + then + bcftools query -f '${vcf_name}.bam\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t%GT]\n' ${vcf_file} >> ${vcf_file.baseName}.${method}.txt + awk -F'\t' 'NR == 1 || (length(\$4) == 1 && length(\$5) == 1)' ${vcf_file.baseName}.mutserve.txt > ${vcf_file.baseName}.${method}.filtered.txt + else + bcftools query -f '${vcf_name}.bam\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\tINDEL\n' ${vcf_file} >> ${vcf_file.baseName}.${method}.txt + awk -F'\t' 'NR == 1 || ((length(\$4) > 1 || length(\$5) > 1) && length(\$4) != length(\$5))' ${vcf_file.baseName}.${method}.txt > ${vcf_file.baseName}.${method}.filtered.txt + fi + """ +} + diff --git a/modules/local/merging_variants.nf b/modules/local/merging_variants.nf new file mode 100644 index 0000000..e0f3a43 --- /dev/null +++ b/modules/local/merging_variants.nf @@ -0,0 +1,20 @@ +process MERGING_VARIANTS { + + //publishDir "${params.output}", mode: 'copy' + + input: + path variants_txt + + output: + path("variants.merged.txt"), emit: txt_summarized_ch + + """ + csvtk concat -t ${variants_txt} -T -o variants.concat.txt + + csvtk sort -t variants.concat.txt -k ID:N -k Pos:n -k Type:r -T -o variants.sorted.txt + + java -jar /opt/VariantMerger.jar \ + variants.sorted.txt \ + --output variants.merged.txt + """ +} \ No newline at end of file diff --git a/modules/local/mutect2.nf b/modules/local/mutect2.nf index ffa83c2..56d44a3 100644 --- a/modules/local/mutect2.nf +++ b/modules/local/mutect2.nf @@ -1,6 +1,6 @@ process MUTECT2 { - publishDir "${params.output}", mode: 'copy' + //publishDir "${params.output}", mode: 'copy', pattern: 'vcf.gz' input: path bam_file @@ -10,7 +10,7 @@ process MUTECT2 { val detected_contig output: - path("${bam_file.baseName}.vcf.gz"), emit: vcf_ch + tuple path("${bam_file.baseName}.vcf.gz"), val('mutect2'), emit: mutect2_vcf_ch path("${bam_file.baseName}.mutect2.txt"), emit: txt_ch path("${bam_file.baseName}.mutect2.filtered.txt"), emit: combined_results @@ -34,10 +34,11 @@ process MUTECT2 { rm raw.vcf.gz bcftools norm -m-any -f ${reference} ${bam_file.baseName}.vcf.gz -o ${bam_file.baseName}.norm.vcf.gz -Oz + mv ${bam_file.baseName}.norm.vcf.gz ${bam_file.baseName}.vcf.gz echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutect2.txt - bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\tINDEL\n' ${bam_file.baseName}.norm.vcf.gz >> ${bam_file.baseName}.mutect2.txt + bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\tINDEL\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutect2.txt awk -F'\t' 'NR == 1 || ((length(\$4) > 1 || length(\$5) > 1) && length(\$4) != length(\$5))' ${bam_file.baseName}.mutect2.txt > ${bam_file.baseName}.mutect2.filtered.txt diff --git a/modules/local/mutserve.nf b/modules/local/mutserve.nf index fa8a048..df98815 100644 --- a/modules/local/mutserve.nf +++ b/modules/local/mutserve.nf @@ -1,20 +1,20 @@ process MUTSERVE { - publishDir "${params.output}", mode: 'copy',pattern: '*.vcf.gz' + //publishDir "${params.output}", mode: 'copy' input: - path bam_files_ch + path bam_file path reference path excluded_samples output: - path("variants.vcf.gz"), emit: vcf_ch - path("variants.txt"), emit: txt_ch - + path("${bam_file.baseName}.mutserve.filtered.txt"), emit: combined_results + tuple path("${bam_file.baseName}.vcf.gz"), val('mutserve'), emit: mutserve_vcf_ch + tuple path("${bam_file.baseName}.vcf.gz"), path("${bam_file.baseName}.vcf.gz.tbi"), emit: mutserve_vcf_only_ch script: def avail_mem = 1024 if (!task.memory) { - log.info '[MUTSERVE] Available memory not known - defaulting to 1GB. Specify process memory requirements to change this.' + //log.info '[MUTSERVE] Available memory not known - defaulting to 1GB. Specify process memory requirements to change this.' } else { avail_mem = (task.memory.mega*0.8).intValue() } @@ -25,10 +25,17 @@ process MUTSERVE { --reference ${reference} \ --mapQ ${params.mapQ} \ --baseQ ${params.baseQ} \ - --deletions --output variants.vcf.gz \ - --no-ansi ${bam_files_ch} \ + --output ${bam_file.baseName}.vcf.gz \ + --no-ansi \ --excluded-samples ${excluded_samples} \ - --threads ${task.cpus} \ - --write-raw + --write-raw \ + ${bam_file} + + tabix ${bam_file.baseName}.vcf.gz + + echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutserve.txt + bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t]SNV\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutserve.txt + awk -F'\t' 'NR == 1 || (length(\$4) == 1 && length(\$5) == 1)' ${bam_file.baseName}.mutserve.txt > ${bam_file.baseName}.mutserve.filtered.txt + """ } \ No newline at end of file diff --git a/modules/local/mutserve_single.nf b/modules/local/mutserve_single.nf deleted file mode 100644 index 0eef520..0000000 --- a/modules/local/mutserve_single.nf +++ /dev/null @@ -1,38 +0,0 @@ -process MUTSERVE_SINGLE { - - //publishDir "${params.output}", mode: 'copy' - - input: - path bam_file - path reference - path excluded_samples - - output: - path("${bam_file.baseName}.mutserve.filtered.txt"), emit: combined_results - - script: - def avail_mem = 1024 - if (!task.memory) { - //log.info '[MUTSERVE] Available memory not known - defaulting to 1GB. Specify process memory requirements to change this.' - } else { - avail_mem = (task.memory.mega*0.8).intValue() - } - - """ - java -Xmx${avail_mem}M -jar /opt/mutserve/mutserve.jar call \ - --level ${params.detection_limit} \ - --reference ${reference} \ - --mapQ ${params.mapQ} \ - --baseQ ${params.baseQ} \ - --output ${bam_file.baseName}.vcf.gz \ - --no-ansi \ - --excluded-samples ${excluded_samples} \ - --write-raw \ - ${bam_file} - - echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutserve.txt - bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t]SNV\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutserve.txt - awk -F'\t' 'NR == 1 || (length(\$4) == 1 && length(\$5) == 1)' ${bam_file.baseName}.mutserve.txt > ${bam_file.baseName}.mutserve.filtered.txt - - """ -} \ No newline at end of file diff --git a/modules/local/summarize_variants.nf b/modules/local/summarize_variants.nf deleted file mode 100644 index 308c339..0000000 --- a/modules/local/summarize_variants.nf +++ /dev/null @@ -1,20 +0,0 @@ -process SUMMARIZE_VARIANTS { - - publishDir "${params.output}", mode: 'copy' - - input: - path variants_txt - - output: - path("variants.fixed.txt"), emit: txt_summarized_ch - - """ - csvtk concat -t ${variants_txt} -T -o variants.concat.txt - - csvtk sort -t variants.concat.txt -k ID:N -k Pos:n -k Type -T -o variants.sorted.txt - - java -jar /opt/VariantMerger.jar \ - variants.sorted.txt \ - --output variants.fixed.txt - """ -} \ No newline at end of file diff --git a/reports/report.Rmd b/reports/report.Rmd index b55b532..349262d 100644 --- a/reports/report.Rmd +++ b/reports/report.Rmd @@ -230,12 +230,21 @@ Row ```{r echo=FALSE, results='asis'} +variants <- variants %>% + mutate(Type = case_when( + Type == "0/1" ~ "2", + Type == "1/0" ~ "2", + Type == "INDEL" ~ "3", + TRUE ~ as.character(Type) + )) + summary_data <- aggregate( Sample_Label ~ Pos + Type , data = variants, FUN = length) -summary_data <- summary_data %>% filter(Sample_Label >= 2) +summary_data <- summary_data %>% filter(Sample_Label >= 1) summary_data <- summary_data %>% mutate( label = case_when( Type == 1 ~ "Variant", Type == 2 ~ "Heteroplasmy", + Type == 3 ~ "INDEL", TRUE ~ "Unknown" ) ) diff --git a/tests/test_mitohpc.config b/tests/test_mitohpc.config index 330a6a5..73d4814 100644 --- a/tests/test_mitohpc.config +++ b/tests/test_mitohpc.config @@ -10,8 +10,9 @@ params { project = "test-job" - files = "/home/seb/Desktop/bams/*L2*" + files = "/home/seb/Desktop/bams/*" output = "out" - mode = "combined" + detection_limit = 0.03 + mode = "fusion" } diff --git a/workflows/mtdna_server_2.nf b/workflows/mtdna_server_2.nf index 08e7719..30f662d 100644 --- a/workflows/mtdna_server_2.nf +++ b/workflows/mtdna_server_2.nf @@ -21,13 +21,12 @@ include { INPUT_VALIDATION } from '../modules/local/input_validation' include { QUALITY_CONTROL } from '../modules/local/quality_control' include { MUTSERVE } from '../modules/local/mutserve' include { MUTECT2 } from '../modules/local/mutect2' -include { SUMMARIZE_VARIANTS } from '../modules/local/summarize_variants' +include { FILTER_VARIANTS } from '../modules/local/filter_variants' +include { MERGING_VARIANTS } from '../modules/local/merging_variants' include { ANNOTATE } from '../modules/local/annotate' include { HAPLOGROUP_DETECTION } from '../modules/local/haplogroup_detection' include { CONTAMINATION_DETECTION } from '../modules/local/contamination_detection' include { REPORT } from '../modules/local/report' -include { MUTSERVE_SINGLE } from '../modules/local/mutserve_single' - workflow MTDNA_SERVER_2 { @@ -76,11 +75,11 @@ workflow MTDNA_SERVER_2 { detected_contig ) - SUMMARIZE_VARIANTS( + MERGING_VARIANTS( MUTECT2.out.txt_ch.collect() ) - variants_txt_ch = SUMMARIZE_VARIANTS.out.txt_summarized_ch + variants_txt_ch = MERGING_VARIANTS.out.txt_summarized_ch } else if (params.mode == 'mutserve') { @@ -106,29 +105,33 @@ workflow MTDNA_SERVER_2 { contamination_ch = CONTAMINATION_DETECTION.out.contamination_txt_ch } - else if (params.mode == 'combined') { + else if (params.mode == 'fusion') { - MUTSERVE_SINGLE( + MUTSERVE( bams_ch, ref_file_mutserve, INPUT_VALIDATION.out.excluded_ch ) MUTECT2( - bams_ch, - ref_file_mutect2, - INPUT_VALIDATION.out.excluded_ch, - INDEX.out.fasta_index_ch, - detected_contig + bams_ch, + ref_file_mutect2, + INPUT_VALIDATION.out.excluded_ch, + INDEX.out.fasta_index_ch, + detected_contig ) - merged_ch = MUTECT2.out.combined_results.collect().combine(MUTSERVE_SINGLE.out.combined_results.collect()) + merged_ch = MUTSERVE.out.mutserve_vcf_ch.concat(MUTECT2.out.mutect2_vcf_ch) + + FILTER_VARIANTS ( + merged_ch + ) - SUMMARIZE_VARIANTS( - merged_ch + MERGING_VARIANTS( + FILTER_VARIANTS.out.combined_methods_ch.collect() ) - variants_txt_ch = SUMMARIZE_VARIANTS.out.txt_summarized_ch + variants_txt_ch = MERGING_VARIANTS.out.txt_summarized_ch }