From 7528fa7525745772e70373480b35dfcfb4067d10 Mon Sep 17 00:00:00 2001 From: Sebastian Schoenherr Date: Fri, 23 Feb 2024 18:21:02 +0100 Subject: [PATCH] Add normalization and merge script --- Dockerfile | 10 ++++ bin/VariantMerger.java | 87 +++++++++++++++++++++++++++++ modules/local/mutect2.nf | 6 +- modules/local/mutserve_single.nf | 2 +- modules/local/summarize_variants.nf | 10 +++- 5 files changed, 110 insertions(+), 5 deletions(-) create mode 100644 bin/VariantMerger.java diff --git a/Dockerfile b/Dockerfile index c141760..3b67a71 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,4 +38,14 @@ RUN wget https://github.com/genepi/haplogrep3/releases/download/v3.2.1/haplogrep rm haplogrep3-3.2.1-linux.zip ENV PATH="/opt/haplogrep:${PATH}" +WORKDIR "/opt" +RUN wget https://github.com/jbangdev/jbang/releases/download/v0.91.0/jbang-0.91.0.zip && \ + unzip -q jbang-*.zip && \ + mv jbang-0.91.0 jbang && \ + rm jbang*.zip + +ENV PATH="/opt/jbang/bin:${PATH}" +WORKDIR "/opt" +COPY ./bin/VariantMerger.java ./ +RUN jbang export portable -O=VariantMerger.jar VariantMerger.java diff --git a/bin/VariantMerger.java b/bin/VariantMerger.java new file mode 100644 index 0000000..6f90b85 --- /dev/null +++ b/bin/VariantMerger.java @@ -0,0 +1,87 @@ +//usr/bin/env jbang "$0" "$@" ; exit $? +//REPOS jcenter,genepi-maven=https://genepi.i-med.ac.at/maven +//DEPS info.picocli:picocli:4.6.1 +//DEPS genepi:genepi-io:1.1.1 + +import java.io.File; +import java.util.concurrent.Callable; +import genepi.io.table.reader.CsvTableReader; +import genepi.io.table.writer.CsvTableWriter; +import picocli.CommandLine; +import picocli.CommandLine.Option; +import picocli.CommandLine.Parameters; + +public class VariantMerger implements Callable { + + @Parameters(description = "Combined variants file") + private String file; + + @Option(names = "--output", description = "Output files", required = true) + private String output; + + @Option(names = "--indel-tag", description = "Detect indels by this tag", required = false) + private String tag = "INDEL"; + + public Integer call() throws Exception { + + assert (file != null); + assert (output != null); + + CsvTableWriter writer = new CsvTableWriter(new File(output).getAbsolutePath(), '\t', false); + CsvTableReader reader = new CsvTableReader(file, '\t'); + + int lastPos = 0; + String[] lastRow = null; + int diff = 0; + writer.setColumns(reader.getColumns()); + + while (reader.next()) { + + int pos = reader.getInteger("Pos"); + int refLength = reader.getString("Ref").length(); + int variantlength = reader.getString("Variant").length(); + + // init new position + if (lastPos == 0) { + diff = refLength - variantlength; + lastPos = pos; + lastRow = reader.getRow(); + continue; + } + + if (comparePositions(lastPos, pos, diff)) { + writer.setRow(lastRow); + writer.next(); + lastPos = 0; + } else { + // since no hit, write previous row. + writer.setRow(lastRow); + writer.next(); + + diff = refLength - variantlength; + lastPos = pos; + lastRow = reader.getRow(); + } + + } + writer.setRow(lastRow); + writer.next(); + reader.close(); + writer.close(); + return 0; + } + + public static void main(String... args) { + int exitCode = new CommandLine(new VariantMerger()).execute(args); + System.exit(exitCode); + } + + public boolean comparePositions(int lastPos, int pos, int diff) { + for (int i = 0; i <= diff; i++) { + if ((lastPos + i) == pos) { + return true; + } + } + return false; + } +} diff --git a/modules/local/mutect2.nf b/modules/local/mutect2.nf index 1ba7f14..ffa83c2 100644 --- a/modules/local/mutect2.nf +++ b/modules/local/mutect2.nf @@ -33,11 +33,13 @@ process MUTECT2 { rm raw.vcf.gz + bcftools norm -m-any -f ${reference} ${bam_file.baseName}.vcf.gz -o ${bam_file.baseName}.norm.vcf.gz -Oz + echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutect2.txt - bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\t0\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutect2.txt + bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP]\tINDEL\n' ${bam_file.baseName}.norm.vcf.gz >> ${bam_file.baseName}.mutect2.txt - awk -F'\t' 'NR == 1 || length(\$4) > 1 || length(\$5) > 1' ${bam_file.baseName}.mutect2.txt > ${bam_file.baseName}.mutect2.filtered.txt + awk -F'\t' 'NR == 1 || ((length(\$4) > 1 || length(\$5) > 1) && length(\$4) != length(\$5))' ${bam_file.baseName}.mutect2.txt > ${bam_file.baseName}.mutect2.filtered.txt """ } \ No newline at end of file diff --git a/modules/local/mutserve_single.nf b/modules/local/mutserve_single.nf index 88eba3c..0eef520 100644 --- a/modules/local/mutserve_single.nf +++ b/modules/local/mutserve_single.nf @@ -31,7 +31,7 @@ process MUTSERVE_SINGLE { ${bam_file} echo -e "ID\tFilter\tPos\tRef\tVariant\tVariantLevel\tCoverage\tType" > ${bam_file.baseName}.mutserve.txt - bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t%GT]\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutserve.txt + bcftools query -f '${bam_file}\t%FILTER\t%POS\t%REF\t%ALT\t[%AF\t%DP\t]SNV\n' ${bam_file.baseName}.vcf.gz >> ${bam_file.baseName}.mutserve.txt awk -F'\t' 'NR == 1 || (length(\$4) == 1 && length(\$5) == 1)' ${bam_file.baseName}.mutserve.txt > ${bam_file.baseName}.mutserve.filtered.txt """ diff --git a/modules/local/summarize_variants.nf b/modules/local/summarize_variants.nf index 7c2370d..308c339 100644 --- a/modules/local/summarize_variants.nf +++ b/modules/local/summarize_variants.nf @@ -6,9 +6,15 @@ process SUMMARIZE_VARIANTS { path variants_txt output: - path("variants.txt"), emit: txt_summarized_ch + path("variants.fixed.txt"), emit: txt_summarized_ch """ - csvtk concat -t ${variants_txt} -T -o variants.txt + csvtk concat -t ${variants_txt} -T -o variants.concat.txt + + csvtk sort -t variants.concat.txt -k ID:N -k Pos:n -k Type -T -o variants.sorted.txt + + java -jar /opt/VariantMerger.jar \ + variants.sorted.txt \ + --output variants.fixed.txt """ } \ No newline at end of file