Merge pull request #69 from genepi/nf-test

Nf test
genepi · Jun 20, 2024 · 84c8f96 · 84c8f96
2 parents 2628ab6 + e80c842
commit 84c8f96
Show file tree

Hide file tree

Showing 119 changed files with 72,809 additions and 92 deletions.
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
@@ -0,0 +1,35 @@
+name: CI Tests  # runs the nf-test suite split across parallel shards
+
+on: [push, pull_request]
+
+jobs:
+
+  test:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        shard: [1, 2, 3, 4, 5]  # one job per entry; strategy.job-total follows this list
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: 'temurin'  # successor of the discontinued AdoptOpenJDK ('adopt') builds
+
+      - name: Setup Nextflow
+        uses: nf-core/setup-nextflow@v2
+        with:
+          version: "latest-edge"
+
+      - name: Install nf-test
+        run: |
+          wget -qO- https://get.nf-test.com | bash -s 0.9.0-rc2
+          sudo mv nf-test /usr/local/bin/
+
+      - name: Run Tests (Shard ${{ matrix.shard }}/${{ strategy.job-total }})
+        run: nf-test test --ci --shard ${{ matrix.shard }}/${{ strategy.job-total }}
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+work
+.nextflow
+.nextflow.*
+tests/output
+.nf-test/
+nf-test
+.nf-test.log
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,57 @@
+cff-version: "1.2.0"
+message: "If you use this software, please cite it as below."
+title: "Nanopore sequencing with unique molecular identifiers enables accurate mutation analysis and haplotyping in the complex Lipoprotein(a) KIV-2 VNTR"
+authors:
+  - family-names: "Amstler"
+    given-names: "Stephan"
+  - family-names: "Streiter"
+    given-names: "Gertraud"
+  - family-names: "Pfurtscheller"
+    given-names: "Cathrin"
+  - family-names: "Forer"
+    given-names: "Lukas"
+  - family-names: "Di Maio"
+    given-names: "Silvia"
+  - family-names: "Weissensteiner"
+    given-names: "Hansi"
+  - family-names: "Paulweber"
+    given-names: "Bernhard"
+  - family-names: "Schoenherr"
+    given-names: "Sebastian"
+  - family-names: "Kronenberg"
+    given-names: "Florian"
+  - family-names: "Coassin"
+    given-names: "Stefan"
+doi: "10.1101/2024.03.01.582741"
+date-released: "2024-03-05"
+license: "Apache-2.0"
+repository-code: "https://github.com/genepi/umi-pipeline-nf"
+preferred-citation:
+  type: "article"
+  authors:
+    - family-names: "Amstler"
+      given-names: "Stephan"
+    - family-names: "Streiter"
+      given-names: "Gertraud"
+    - family-names: "Pfurtscheller"
+      given-names: "Cathrin"
+    - family-names: "Forer"
+      given-names: "Lukas"
+    - family-names: "Di Maio"
+      given-names: "Silvia"
+    - family-names: "Weissensteiner"
+      given-names: "Hansi"
+    - family-names: "Paulweber"
+      given-names: "Bernhard"
+    - family-names: "Schoenherr"
+      given-names: "Sebastian"
+    - family-names: "Kronenberg"
+      given-names: "Florian"
+    - family-names: "Coassin"
+      given-names: "Stefan"
+  doi: "10.1101/2024.03.01.582741"
+  journal: "bioRxiv"
+  date-published: "2024-03-05"  # CFF 1.2.0 has no 'day' key; the full date goes here
+  month: 3
+  title: "Nanopore sequencing with unique molecular identifiers enables accurate mutation analysis and haplotyping in the complex Lipoprotein(a) KIV-2 VNTR"
+  year: 2024
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ Umi-pipeline-nf
 
 **Umi-pipeline-nf** creates highly accurate single-molecule consensus sequences for unique molecular identifier (UMI)-tagged amplicons from nanopore sequencing data.  
 The pipeline can be run for the whole fastq_pass folder of your nanopore run and, per default, outputs the aligned consensus sequences of each UMI cluster in bam file. The optional variant calling creates a vcf file for all variants that are found in the consensus sequences.
-umi-pipeline-nf is inspired by a snakemake-based analysis pipeline ([ONT UMI analysis pipeline](https://github.com/nanoporetech/pipeline-umi-amplicon); originally developed by [Karst et al, Nat Biotechnol 18:165–169, 2021](https://www.nature.com/articles/s41592-020-01041-y)). We migrated the pipeline in [Nextflow](https://www.nextflow.io), included several optimizations and [additional functionalities](#main-adaptations).  
+Umi-pipeline-nf originates from a snakemake-based analysis pipeline ([pipeline-umi-amplicon](https://github.com/nanoporetech/pipeline-umi-amplicon); originally developed by [Karst et al, Nat Methods 18:165–169, 2021](https://www.nature.com/articles/s41592-020-01041-y)). We migrated the pipeline to [Nextflow](https://www.nextflow.io) and included several optimizations and [additional functionalities](#main-adaptations).  
 
 ![Workflow](docs/images/umi-pipeline-nf_metro-map.svg)
 
@@ -43,20 +43,26 @@ umi-pipeline-nf is inspired by a snakemake-based analysis pipeline ([ONT UMI ana
 2. Download the pipeline and test it on a [minimal dataset](data/info.txt) with a single command.
 
 ```bash
-nextflow run genepi/umi-pipeline-nf -r v0.1.0 -profile test,docker
+nextflow run genepi/umi-pipeline-nf -r v0.2.1 -profile test,docker
 ```
 
 3. Start running your own analysis!  
 3.1 Download and adapt the config/custom.config with paths to your data (relative and absolute paths possible).
 
 ```bash
-nextflow run genepi/umi-pipeline-nf -r v0.1.0 -c <custom.config> -profile docker 
+nextflow run genepi/umi-pipeline-nf -r v0.2.1 -c <custom.config> -profile custom,<docker,singularity> 
 ```
 
+## Citation 
+
+If you use the pipeline, please cite [our paper](https://www.biorxiv.org/content/10.1101/2024.03.01.582741v1):
+
+Amstler S, Streiter G, Pfurtscheller C, Forer L, Di Maio S, Weissensteiner H, Paulweber B, Schoenherr S, Kronenberg F, Coassin S. Nanopore sequencing with unique molecular identifiers enables accurate mutation analysis and haplotyping in the complex Lipoprotein(a) KIV-2 VNTR. bioRxiv. 2024. doi: 10.1101/2024.03.01.582741.
+
 
 ### Credits
 
 The pipeline was written by ([@StephanAmstler](https://github.com/AmstlerStephan)).  
 Nextflow template pipeline: [EcSeq](https://github.com/ecSeq).  
-Snakemake-based ONT pipeline: [nanoporetech/pipeline-umi-amplicon](https://github.com/nanoporetech/pipeline-umi-amplicon).  
-Original workflow: [SorenKarst/longread_umi](https://github.com/SorenKarst/longread_umi).
+Snakemake-based ONT pipeline for UMI nanopore sequencing analysis: [nanoporetech/pipeline-umi-amplicon](https://github.com/nanoporetech/pipeline-umi-amplicon).  
+UMI-corrected nanopore sequencing analysis first shown by: [SorenKarst/longread_umi](https://github.com/SorenKarst/longread_umi).
diff --git a/bin/extract_umis.py b/bin/extract_umis.py
@@ -1,3 +1,8 @@
+"""
+This is a modified version of the code present in:
+https://github.com/nanoporetech/pipeline-umi-amplicon/blob/master/lib/umi_amplicon_tools/extract_umis.py
+"""
+
 import argparse
 import logging
 import os

diff --git a/bin/filter_reads.py b/bin/filter_reads.py
@@ -1,3 +1,7 @@
+"""
+This is a modified version of the code present in:
+https://github.com/nanoporetech/pipeline-umi-amplicon/blob/master/lib/umi_amplicon_tools/filter_reads.py
+"""
 import argparse
 import logging
 import os

diff --git a/bin/parse_clusters.py b/bin/parse_clusters.py
@@ -1,3 +1,8 @@
+"""
+This is a modified version of the code present in:
+https://github.com/nanoporetech/pipeline-umi-amplicon/blob/master/lib/umi_amplicon_tools/parse_clusters.py
+"""
+
 import argparse
 import logging
 import os

diff --git a/bin/reformat_consensus.py b/bin/reformat_consensus.py
@@ -1,3 +1,8 @@
+"""
+This is a modified version of the code present in:
+https://github.com/nanoporetech/pipeline-umi-amplicon/blob/master/lib/umi_amplicon_tools/reformat_consensus.py
+"""
+
 import argparse
 import logging
 import sys

diff --git a/config/base.config b/config/base.config
@@ -4,10 +4,10 @@
 // PROCESS RESOURCES
 process {
 	withName: "POLISH_CLUSTER" {
-		memory = { 10.GB * task.attempt }
-		cpus = 2
+		memory = { 2.GB * task.attempt }
+		cpus = 1
 	}
 
 	errorStrategy =  'retry'
-	maxRetries = 3
+	maxRetries = 5
 }
diff --git a/config/custom.config b/config/custom.config
@@ -7,27 +7,45 @@
 
 params {
 
-	help 						= false
-	version 					= false
-	debug 						= false
+	help 			= false
+	version 		= false
+	debug 			= false
 
 	// required parameters
 
-	input 						= "PATH/TO/fastq_pass/"
-	output 						= "PATH/TO/OUTPUT_DIR"
-	reference 					= "PATH/TO/REF.fasta"
-	reference_fai 				= "PATH/TO/REF.fasta.fai"
-	bed 						= "PATH/TO/BED.bed"
+	input 			= "PATH/TO/fastq_pass/"
+	output 			= "PATH/TO/OUTPUT_DIR"
+	reference 		= "PATH/TO/REF.fasta"
+	reference_fai 		= "PATH/TO/REF.fasta.fai"
+	bed 			= "PATH/TO/BED.bed"
 
 	// adaptable parameters
 
-	output_format 				= "fastq"	
-	filter_strategy_clusters 	= "quality"
+    //READ FILTERING
+    min_read_length             = 0
+    min_qscore                  = 10
 
-	call_variants 				= true
-	variant_caller 				= "freebayes"
+    // SUBSAMPLING
+    subsampling                 = false
+    subsampling_seed            = 11
+    subsampling_readnumber      = 100000
+
+    // VARIANT_CALLING
+    call_variants               = false    
+    variant_caller              = "freebayes"
 
-	medaka_model 				= "r1041_e82_400bps_hac_g615"
+    // ADVANCED
+    min_reads_per_barcode       = 1000
+    umi_errors                  = 2
+    max_dist_umi                = 2
+    min_reads_per_cluster       = 20
+    max_reads_per_cluster       = 60
+    min_consensus_quality   	= 40
+    masking_strategy        	= "softmask"
+    filter_strategy_clusters    = "quality"
+    min_overlap                 = 0.95
+    balance_strands             = true
+    medaka_model                = "r1041_e82_400bps_hac_g615"
 }
 
 // NEXTFLOW REPORTING

diff --git a/config/test.config b/config/test.config
@@ -8,28 +8,21 @@
 
 params {
 
-	help 				= false
-	version 			= false
-	debug 				= true
+	help 			= false
+	version 		= false
+	debug 			= false
 
-	input 				= "$baseDir/data/fastq_pass/"
-	output 				= "umi-pipeline-nf_test-run"
-	reference 			= "$baseDir/data/ref/lpa-ref2645.fasta"
-	reference_fai 			= "$baseDir/data/ref/lpa-ref2645.fasta.fai"
-	bed 				= "$baseDir/data/ref/lpa-ref2645.bed"
+	input 			= "$baseDir/tests/input/pipeline/fastq_pass/"
+	output 			= "test_umi-pipeline-nf"
+	reference 		= "$baseDir/tests/input/pipeline/ref/lpa-ref2645.fasta"
+	reference_fai 	= "$baseDir/tests/input/pipeline/ref/lpa-ref2645.fasta.fai"
+	bed 			= "$baseDir/tests/input/pipeline/ref/lpa-ref2645.bed"
 
-	subsampling 			= false
-
-	min_reads_per_cluster 		= 10
-	max_reads_per_cluster 		= 20
-
-	write_reports 			= true
-	output_format 			= "fastq"
-	filter_strategy_clusters 	= "quality"
-	call_variants 			= true
-	variant_caller 			= "freebayes"
-
-	medaka_model 			= "r1041_e82_400bps_hac_g615"
+	min_reads_per_cluster 	= 10
+	max_reads_per_cluster 	= 20
+	min_reads_per_barcode = 0
+	call_variants 		= true
+	variant_caller 		= "freebayes"
 }
 
 // NEXTFLOW REPORTING

diff --git a/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_0.fastq.gz b/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_0.fastq.gz
diff --git a/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_1.fastq.gz b/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_1.fastq.gz
diff --git a/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_2.fastq.gz b/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_2.fastq.gz
diff --git a/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_3.fastq.gz b/data/fastq_pass/barcode02/fastq_runid_b05e9ba03fee394fc76a041d3df7426ff48616c9_3.fastq.gz
diff --git a/data/info.txt b/data/info.txt
diff --git a/env/Dockerfile b/env/Dockerfile
@@ -18,8 +18,4 @@ RUN conda update -y conda && \
     conda clean --all
 
 WORKDIR "/opt"
-RUN wget https://github.com/seppinho/mutserve/releases/download/v2.0.0-rc15/mutserve.zip && \
-    unzip mutserve.zip
-ENV PATH="/opt/mutserve:${PATH}"
-
-
+RUN wget https://github.com/seppinho/mutserve/releases/download/v2.0.0-rc13.lpa/mutserve_LPA_adapted.jar
diff --git a/env/environment.yml b/env/environment.yml
@@ -11,7 +11,7 @@ dependencies:
   - seqtk=1.3
   - lofreq=2.1.5
   - freebayes=1.3.2
-  - vcflib
+  - vcflib=1.0.0
   - bedtools=2.30.0
   - vsearch=2.21.2
   - openjdk=11.0.9

diff --git a/lib/processes/cluster.nf b/lib/processes/cluster.nf
@@ -3,16 +3,17 @@ vsearch_dir="vsearch_clusters"
 
 process CLUSTER {
     publishDir "${params.output}/${sample}/clustering/${type}", pattern: "${consensus_fasta}", mode: 'copy'
+    publishDir "${params.output}/${sample}/clustering/${type}", pattern: "cluster*", mode: 'copy'
 
     input:
         tuple val( sample ), val( target ), path( detected_umis_fastq )
         val ( type )
     output:
-        tuple val( "${sample}" ), val( "${target}" ), path( "${consensus_fasta}" ), emit:consensus_fasta
-        tuple val( "${sample}" ), val( "${target}" ), path( "cluster*" ), emit:cluster_fastas
+        tuple val( "${sample}" ), val( "${target}" ), path( "${consensus_fasta}" ), optional: true, emit:consensus_fasta
+        tuple val( "${sample}" ), val( "${target}" ), path( "cluster*" ), optional: true, emit:cluster_fastas
 
     script:
-        def id = "${type}" == "raw" ? 0.8 : 0.99
+        def id = "${type}" == "raw" ? 0.90 : 0.99
     """
         vsearch \
         --clusterout_id \

diff --git a/lib/processes/variant_calling/freebayes.nf → lib/processes/freebayes.nf b/lib/processes/variant_calling/freebayes.nf → lib/processes/freebayes.nf
diff --git a/lib/processes/variant_calling/lofreq.nf → lib/processes/lofreq.nf b/lib/processes/variant_calling/lofreq.nf → lib/processes/lofreq.nf
@@ -16,6 +16,8 @@ process LOFREQ {
       --ref ${reference} \
       --out ${type}.vcf \
       --call-indels \
+      --min-cov 5 \
+      --no-default-filter \
       ${bam}
     """
 }
diff --git a/lib/processes/variant_calling/mutserve.nf → lib/processes/mutserve.nf b/lib/processes/variant_calling/mutserve.nf → lib/processes/mutserve.nf
@@ -14,7 +14,7 @@ process MUTSERVE {
 
     script:
     """
-      mutserve call  \
+      java -jar /opt/mutserve_LPA_adapted.jar call  \
       --output ${type}.vcf \
       --write-raw \
       --reference ${reference} \

diff --git a/lib/processes/reformat_filter_cluster.nf b/lib/processes/reformat_filter_cluster.nf
@@ -1,6 +1,6 @@
 process REFORMAT_FILTER_CLUSTER {
     tag "${sample}"
-    // publishDir "${params.output}/${sample}/clustering/${type}/smolecule", pattern: "smolecule*", mode: 'copy'
+    publishDir "${params.output}/${sample}/clustering/${type}/smolecule", pattern: "smolecule*", mode: 'copy'
     publishDir "${params.output}/${sample}/stats/${type}", pattern: "*tsv", mode: 'copy'
 
     input:

diff --git a/lib/workflows/umi-pipeline.nf b/lib/workflows/umi-pipeline.nf
@@ -51,9 +51,9 @@ include {REFORMAT_FILTER_CLUSTER} from '../processes/reformat_filter_cluster.nf'
 include {POLISH_CLUSTER} from '../processes/polish_cluster.nf'
 include {FILTER_CONSENSUS_FASTQ} from '../processes/filter_consensus_fastq.nf'
 include {REFORMAT_CONSENSUS_CLUSTER} from '../processes/reformat_consensus_cluster.nf'
-include {LOFREQ as LOFREQ_CONSENSUS; LOFREQ as LOFREQ_FINAL_CONSENSUS} from '../processes/variant_calling/lofreq.nf'
-include {MUTSERVE as MUTSERVE_CONSENSUS; MUTSERVE as MUTSERVE_FINAL_CONSENSUS} from '../processes/variant_calling/mutserve.nf'
-include {FREEBAYES as FREEBAYES_CONSENSUS; FREEBAYES as FREEBAYES_FINAL_CONSENSUS} from '../processes/variant_calling/freebayes.nf'
+include {LOFREQ as LOFREQ_CONSENSUS; LOFREQ as LOFREQ_FINAL_CONSENSUS} from '../processes/lofreq.nf'
+include {MUTSERVE as MUTSERVE_CONSENSUS; MUTSERVE as MUTSERVE_FINAL_CONSENSUS} from '../processes/mutserve.nf'
+include {FREEBAYES as FREEBAYES_CONSENSUS; FREEBAYES as FREEBAYES_FINAL_CONSENSUS} from '../processes/freebayes.nf'
 
 
 // SUB-WORKFLOWS