Merge pull request IARCbioinfo#2 from senkin/master

Adding readme and configs
Zemzemfiras1 · Dec 1, 2023 · 8d076f4 · 8d076f4
2 parents 539debf + 359bbfe
commit 8d076f4
Show file tree

Hide file tree

Showing 4 changed files with 126 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,73 @@
 # snpeff_annotation-nf
-Annotate VCF files with SnpEff and dbSnp
+## Nextflow DSL2 pipeline to annotate VCF files with SnpEff and dbSNP
+
+This repository contains a Nextflow DSL2 pipeline for annotating genetic variants in VCF files using SnpEff, SnpSift, and the dbSNP and dbNSFP databases. The pipeline filters the input VCF files, annotates them in several stages, and generates a comprehensive annotation file.
+
+## Prerequisites
+
+Make sure you have the following dependencies installed before running the pipeline:
+
+- [Nextflow](https://www.nextflow.io/)
+- [conda](https://conda.io/projects/conda/en/latest/index.html)
+- [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms)
+- [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/)
+
+## Pipeline Overview
+
+1. **FilterInputFiles:** Filters input VCF files using PLINK 2 to retain PASS variants with a maximum of 2 alleles.
+
+2. **AnnotateWithRSID:** Annotates variants with RSID using SnpSift and the dbSNP database.
+
+3. **AnnotateWithImpact:** Annotates variants with functional impact using snpEff and a specified reference genome.
+
+4. **FullyAnnotateWithDbSNP:** Performs comprehensive annotation using SnpSift and the dbNSFP database, including information on gene impact, gnomAD data, REVEL scores, ClinVar information, and more.
+
+5. **ExtractFields:** Extracts relevant fields from the annotated VCF files and creates a tab-separated text file with a header for downstream analysis.
+
+## Usage
+
+1. Clone the repository:
+
+   ```bash
+   git clone https://github.com/IARCbioinfo/snpeff_annotation-nf
+   cd snpeff_annotation-nf
+   ```
+
+2. Adjust the `nextflow.config` file if necessary. The package versions are specified in the `environment.yml` file.
+
+3. Run the pipeline with:
+
+   ```bash
+   nextflow run main.nf -profile conda
+   ```
+
+## Input
+
+| Name      | Default value | Description     |
+|-----------|---------------|-----------------|
+| `--input_folder_with_VCF_files`    |  `${baseDir}/VCFs/`  | Folder containing `*vcf.gz` files |
+
+
+## Parameters
+
+  * #### Optional
+
+| Name      | Default value | Description     |
+|-----------|---------------|-----------------|
+| `--reference_genome`    |  `GRCh37.75`  | Reference genome |
+| `--dbNSF_path`     |  `${baseDir}/dbNSFP4.1a.txt.gz` | [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) |
+| `--dbSNP_path`    |  `${baseDir}/dbsnp150.vcf.gz`  |    [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) |
+| `--output_path`    |  `${baseDir}/output` |  Output folder |
+
+## Output
+
+The final annotated and extracted information will be available in the output directory as `full_annotation.txt`.
+
+## Customization
+
+- Adjust memory requirements and other process settings in the `nextflow.config` file.
+- Customize the annotation processes in the `main.nf` script based on your specific requirements.
+
+## Acknowledgments
+
+- This pipeline utilizes various bioinformatics tools and databases, including PLINK, bcftools, SnpSift, snpEff, dbNSFP, and dbSNP.
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,11 @@
+name: annotation-pipeline
+channels:
+  - bioconda
+  - defaults
+  - conda-forge
+dependencies:
+  - bcftools=1.9
+  - plink2=2.00a2.3
+  - snpeff=5.0-0
+  - snpsift=5.1
+  - py-bgzip=0.4.0
diff --git a/main.nf b/main.nf
@@ -1,43 +1,11 @@
 #!/usr/bin/env nextflow
 
-params.input_folder_with_VCF_files = "${baseDir}/VCFs/*vcf.gz"
+params.input_folder_with_VCF_files = "${baseDir}/VCFs/"
 params.reference_genome = "GRCh37.75"
 params.dbNSF_path = "${baseDir}/dbNSFP4.1a.txt.gz"
 params.dbSNP_path = "${baseDir}/dbsnp150.vcf.gz"
 params.output_path = "${baseDir}/output"
 
-// process DownloadDbNSF {
-//     output:
-//     file dbNSF "./dbNSFP4.1a.txt.gz"
-//     file dbNSFIndex "./dbNSFP4.1a.txt.gz.tbi"
-
-//     script:
-//     """
-//     if [ ! -f ${params.dbNSF_path} ]; then
-//         echo "dbNSF database not found, downloading..."
-//         wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz
-//         wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz.tbi
-//     fi
-//     """
-// }
-
-// process DownloadDbSNP {
-//     output:
-//     file dbSNP "./dbsnp150.vcf.gz"
-//     file dbSNPIndex "./dbsnp150.vcf.gz.tbi"
-
-//     script:
-//     """
-//     if [ ! -f ${params.dbSNP_path} ]; then
-//         echo "dbSNP database not found, downloading..."
-//         wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz
-//         wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz.tbi
-//         mv 00-All.vcf.gz dbsnp150.vcf.gz
-//         mv 00-All.vcf.gz.tbi dbsnp150.vcf.gz.tbi
-//     fi
-//     """
-// }
-
 process FilterInputFiles {
     tag "Sample ${sample}"
 
@@ -195,10 +163,8 @@ process ExtractFields {
 }
 
 workflow {
-    // DownloadDbNSF //TODO - download databases if not available
-    // DownloadDbSNP
     // Grab input VCF files
-    file_channel = Channel.fromPath( params.input_folder_with_VCF_files, checkIfExists: true )
+    file_channel = Channel.fromPath( params.input_folder_with_VCF_files + '/*vcf.gz', checkIfExists: true )
     // Launch the pipeline
     FilterInputFiles(file_channel) \
         | AnnotateWithRSID \

diff --git a/nextflow.config b/nextflow.config
@@ -0,0 +1,41 @@
+conda.enabled = true
+conda.createTimeout = '3 h'
+
+profiles {
+  conda {
+	process.conda = "$baseDir/environment.yml"
+  }
+}
+
+process {
+    shell =  ['/bin/bash','-o','pipefail']
+    withLabel: big_mem {
+        memory = 16.GB
+    }
+}
+
+params.output_path = "${baseDir}/output"
+
+timeline {
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_timeline.html"
+}
+
+report {
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_report.html"
+}
+
+trace {
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_trace.txt"
+}
+
+dag {
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_dag.html"
+}