From bb711a6fe995a880b81dc9fb01a7d119791167fb Mon Sep 17 00:00:00 2001
From: Sebastian Schoenherr
Date: Wed, 6 Mar 2024 15:09:06 +0100
Subject: [PATCH] Add subsampling module

---
NOTE(review): this patch was re-flowed from a whitespace-collapsed copy. The
indentation and blank lines of the context lines below are reconstructed and
must be verified against the target files before applying.

 Dockerfile                   |  4 +++-
 cloudgene.yaml               |  6 ++++++
 environment.yml              |  1 +
 modules/local/subsampling.nf | 32 ++++++++++++++++++++++++++++++++
 nextflow.config              |  1 +
 workflows/mtdna_server_2.nf  |  9 +++++++++
 6 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 modules/local/subsampling.nf

diff --git a/Dockerfile b/Dockerfile
index b0d60d9..ea21355 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,9 @@ RUN apt-get update && \
     zlib1g-dev \
     libgomp1 \
     procps \
-    libx11-6
+    libx11-6 \
+    bc
+
 RUN apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # Install mutserve (not as conda package available)
diff --git a/cloudgene.yaml b/cloudgene.yaml
index a117595..2ea24da 100644
--- a/cloudgene.yaml
+++ b/cloudgene.yaml
@@ -105,6 +105,12 @@ workflow:
         on: On
         off: Off
 
+    - id: subsampling
+      description: Subsample to specified coverage (Deactivated with value 0)
+      type: number
+      visible: true
+      value: 0
+
     - id: myseparator0
       type: separator
 
diff --git a/environment.yml b/environment.yml
index 361615c..02c920f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,6 +5,7 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
+  - conda-forge::ncurses
   - unzip=6.0
   - openjdk=17
   - r-base=4.3.2
diff --git a/modules/local/subsampling.nf b/modules/local/subsampling.nf
new file mode 100644
index 0000000..d0f8ace
--- /dev/null
+++ b/modules/local/subsampling.nf
@@ -0,0 +1,32 @@
+// Subsample a BAM file down to a target mean coverage.
+// The BAM is only rewritten when its measured mean depth exceeds `coverage`;
+// otherwise the input file is emitted unchanged.
+process SUBSAMPLING {
+
+    input:
+    path bam_file
+    val coverage
+
+    output:
+    path "${bam_file}", includeInputs: true, emit: subsampled_bam_ch
+
+    script:
+    """
+    samtools coverage ${bam_file} > samtools_coverage_${bam_file.baseName}.txt
+
+    # Take field 7 of the row whose field 3 equals 16569 (presumably the mean depth
+    # of the 16569 bp mitochondrial contig - confirm against samtools coverage docs).
+    mean_cov=\$(csvtk grep -t -f3 -p 16569 -C '\$' samtools_coverage_${bam_file.baseName}.txt | csvtk cut -t -f 7)
+    mean_cov_int=\$(printf "%.0f" "\${mean_cov:-0}")
+
+    if [ \${mean_cov_int} -gt ${coverage} ]
+    then
+        # The fraction is computed only in this branch, so an empty or zero mean
+        # coverage never reaches the division. samtools view -s reads the integer
+        # part (1) as the seed and the decimals as the fraction of reads to keep.
+        fraction=\$(echo "scale=4; 1+(${coverage} / \${mean_cov})" | bc)
+        samtools view -s \$fraction -b -o ${bam_file.baseName}.subsampled.bam ${bam_file}
+        mv ${bam_file.baseName}.subsampled.bam ${bam_file}
+    fi
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index 7f075b6..4e17006 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -23,6 +23,7 @@ params {
     alignQ = 30
     coverage_estimation = "on"
     max_samples = 0
+    subsampling = 0
 
 
     service = [
diff --git a/workflows/mtdna_server_2.nf b/workflows/mtdna_server_2.nf
index b66862d..eadbdaf 100644
--- a/workflows/mtdna_server_2.nf
+++ b/workflows/mtdna_server_2.nf
@@ -28,6 +28,7 @@ include { ANNOTATE } from '../modules/local/annotate'
 include { HAPLOGROUPS_CONTAMINATION } from '../modules/local/haplogroups_contamination'
 include { COVERAGE_ESTIMATION } from '../modules/local/coverage_estimation'
 include { REPORT } from '../modules/local/report'
+include { SUBSAMPLING } from '../modules/local/subsampling'
 include { SAMPLE_REPORT } from '../modules/local/sample_report'
 
 
@@ -83,6 +84,14 @@ workflow MTDNA_SERVER_2 {
 
     validated_files = INPUT_VALIDATION.out.validated_files.flatten()
 
+    if(params.subsampling != 0) {
+        SUBSAMPLING (
+            validated_files,
+            params.subsampling
+        )
+        validated_files = SUBSAMPLING.out.subsampled_bam_ch
+    }
+
     if (params.mode == 'mutserve') {
 
         MUTSERVE(