From d8cc901305d876624255c00d6fd1dbf3bafe3553 Mon Sep 17 00:00:00 2001
From: Sergey Senkin <s.senkin@gmail.com>
Date: Fri, 1 Dec 2023 15:42:54 +0100
Subject: [PATCH 1/4] Adding readme and configs

---
 README.md       | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-
 environment.yml | 11 +++++++++
 nextflow.config | 41 ++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 environment.yml
 create mode 100644 nextflow.config

diff --git a/README.md b/README.md
index f8e6591..c6fe168 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,59 @@
 # snpeff_annotation-nf
-Annotate VCF files with SnpEff and dbSnp
+## Nextflow pipeline to annotate VCF files with SnpEff and dbSnp
+
+This repository contains a Nextflow pipeline for annotating genetic variants in VCF (Variant Call Format) files. The pipeline processes input VCF files, performs various annotations, and generates a comprehensive annotation file.
+
+## Prerequisites
+
+Make sure you have the following dependencies installed before running the pipeline:
+
+- [Nextflow](https://www.nextflow.io/)
+- [conda](https://conda.io/projects/conda/en/latest/index.html)
+- [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms)
+- [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/)
+
+## Pipeline Overview
+
+1. **FilterInputFiles:** Filters input VCF files using PLINK 2 to retain PASS variants with a maximum of 2 alleles.
+
+2. **AnnotateWithRSID:** Annotates variants with RSID using SnpSift and the dbSNP database.
+
+3. **AnnotateWithImpact:** Annotates variants with functional impact using snpEff and a specified reference genome.
+
+4. **FullyAnnotateWithDbSNP:** Performs comprehensive annotation using SnpSift and the dbNSFP database, including information on gene impact, gnomAD data, REVEL scores, ClinVar information, and more.
+
+5. **ExtractFields:** Extracts relevant fields from the annotated VCF files and creates a tab-separated text file with a header for downstream analysis.
+
+## Usage
+
+1. Clone the repository:
+
+   ```bash
+   git clone https://github.com/IARCbioinfo/snpeff_annotation-nf
+   cd snpeff_annotation-nf
+   ```
+
+2. Adjust the `nextflow.config` file if necessary.
+
+3. Run the pipeline with:
+
+   ```bash
+   nextflow run main.nf -profile conda
+   ```
+
+## Input
+
+The directory with input VCF files specified by the `params.input_folder_with_VCF_files` parameter (by default, `./VCFs/*vcf.gz`)
+
+## Output
+
+The final annotated and extracted information will be available in the `output` directory as `full_annotation.txt`.
+
+## Customization
+
+- Adjust resource settings such as memory requirements in the `nextflow.config` file.
+- Customize the annotation processes in the `main.nf` script based on your specific requirements.
+
+## Acknowledgments
+
+- This pipeline utilizes various bioinformatics tools and databases, including PLINK, bcftools, SnpSift, snpEff, dbNSFP, and dbSNP.
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..7b9688d
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,11 @@
+name: annotation-pipeline  # name of the conda environment
+channels:  # channels searched for the packages listed under dependencies
+  - bioconda
+  - defaults
+  - conda-forge
+dependencies:  # command-line tools, each pinned to a fixed version
+  - bcftools=1.9
+  - plink2=2.00a2.3
+  - snpeff=5.0-0
+  - snpsift=5.1
+  - py-bgzip=0.4.0
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..4f57476
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,41 @@
+conda.enabled = true  // enable conda support (set outside any profile)
+conda.createTimeout = '3 h'  // time limit for creating the conda environment
+
+profiles {
+  conda {  // selected with `-profile conda`
+	process.conda = "$baseDir/environment.yml"  // environment definition shipped next to the pipeline
+  }
+}
+
+process {
+    shell =  ['/bin/bash','-o','pipefail']  // run task scripts with bash and the pipefail option set
+    withLabel: big_mem {  // applies only to processes labelled `big_mem`
+        memory = 16.GB
+    }
+}
+
+params.output_path = "${baseDir}/output"  // base folder; the report files below are written under it
+
+timeline {  // execution timeline, written as HTML
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_timeline.html"
+}
+
+report {  // execution report, written as HTML
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_report.html"
+}
+
+trace {  // execution trace, written as a text file
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_trace.txt"
+}
+
+dag {  // workflow graph, written as HTML
+  enabled = true
+  overwrite = true
+  file = "${params.output_path}/nf-pipeline_info/annotation-nf_dag.html"
+}

From 83589751310902bbd1ccc030ccf505bd99f0e83c Mon Sep 17 00:00:00 2001
From: Sergey Senkin <s.senkin@gmail.com>
Date: Fri, 1 Dec 2023 15:57:07 +0100
Subject: [PATCH 2/4] Correcting README

---
 README.md | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c6fe168..7e1b5d0 100644
--- a/README.md
+++ b/README.md
@@ -43,11 +43,25 @@ Make sure you have the following dependencies installed before running the pipel
 
 ## Input
 
-The directory with input VCF files specified by the `params.input_folder_with_VCF_files` parameter (by default, `./VCFs/*vcf.gz`)
+| Name      | Default value | Description     |
+|-----------|---------------|-----------------|
+| `--output_foinput_folder_with_VCF_fileslder`    |  `${baseDir}/VCFs/`  | Folder containing `*vcf.gz` files |
+ 
+
+## Parameters
+
+  * #### Optional
+
+| Name      | Default value | Description     |
+|-----------|---------------|-----------------|
+| `--reference_genome`    |  `GRCh37.75`  | Folder containing `*vcf.gz` files |
+| `--dbNSF_path`     |  `${baseDir}/dbNSFP4.1a.txt.gz` | [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) |
+| `--dbSNP_path`    |  `${baseDir}/dbsnp150.vcf.gz`  |    [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) |
+| `--output_path`    |  `${baseDir}/output` |  Output folder |
 
 ## Output
 
-The final annotated and extracted information will be available in the `output` directory as `full_annotation.txt`.
+The final annotated and extracted information will be available in the output directory as `full_annotation.txt`.
 
 ## Customization
 

From 05b1e73b2f36161464dde370aadb1d1d22be30fa Mon Sep 17 00:00:00 2001
From: Sergey Senkin <s.senkin@gmail.com>
Date: Fri, 1 Dec 2023 15:59:03 +0100
Subject: [PATCH 3/4] Correcting README

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7e1b5d0..5de7fff 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ Make sure you have the following dependencies installed before running the pipel
 
 | Name      | Default value | Description     |
 |-----------|---------------|-----------------|
-| `--output_foinput_folder_with_VCF_fileslder`    |  `${baseDir}/VCFs/`  | Folder containing `*vcf.gz` files |
+| `--input_folder_with_VCF_files`    |  `${baseDir}/VCFs/`  | Folder containing `*vcf.gz` files |
  
 
 ## Parameters
@@ -54,7 +54,7 @@ Make sure you have the following dependencies installed before running the pipel
 
 | Name      | Default value | Description     |
 |-----------|---------------|-----------------|
-| `--reference_genome`    |  `GRCh37.75`  | Folder containing `*vcf.gz` files |
+| `--reference_genome`    |  `GRCh37.75`  | Reference genome version used by snpEff |
 | `--dbNSF_path`     |  `${baseDir}/dbNSFP4.1a.txt.gz` | [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) |
 | `--dbSNP_path`    |  `${baseDir}/dbsnp150.vcf.gz`  |    [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) |
 | `--output_path`    |  `${baseDir}/output` |  Output folder |

From 359bbfe267dffe490d3a1ee169c63dce16f04ae9 Mon Sep 17 00:00:00 2001
From: Sergey Senkin <s.senkin@gmail.com>
Date: Fri, 1 Dec 2023 16:11:01 +0100
Subject: [PATCH 4/4] Cleaning up

---
 README.md |  6 +++---
 main.nf   | 38 ++------------------------------------
 2 files changed, 5 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 5de7fff..6b686ed 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # snpeff_annotation-nf
-## Nextflow pipeline to annotate VCF files with SnpEff and dbSnp
+## Nextflow DSL2 pipeline to annotate VCF files with SnpEff and dbSNP
 
-This repository contains a Nextflow pipeline for annotating genetic variants in VCF (Variant Call Format) files. The pipeline processes input VCF files, performs various annotations, and generates a comprehensive annotation file.
+This repository contains a Nextflow DSL2 pipeline for annotating genetic variants in VCF files using SnpEff and the dbSNP database. The pipeline processes input VCF files, performs various annotations, and generates a comprehensive annotation file.
 
 ## Prerequisites
 
@@ -33,7 +33,7 @@ Make sure you have the following dependencies installed before running the pipel
    cd snpeff_annotation-nf
    ```
 
-2. Adjust the `nextflow.config` file if necessary.
+2. Adjust the `nextflow.config` file if necessary. The package versions are specified in the `environment.yml` file.
 
 3. Run the pipeline with:
 
diff --git a/main.nf b/main.nf
index 3dd3b51..977bbff 100755
--- a/main.nf
+++ b/main.nf
@@ -1,43 +1,11 @@
 #!/usr/bin/env nextflow
 
-params.input_folder_with_VCF_files = "${baseDir}/VCFs/*vcf.gz"
+params.input_folder_with_VCF_files = "${baseDir}/VCFs/"
 params.reference_genome = "GRCh37.75"
 params.dbNSF_path = "${baseDir}/dbNSFP4.1a.txt.gz"
 params.dbSNP_path = "${baseDir}/dbsnp150.vcf.gz"
 params.output_path = "${baseDir}/output"
 
-// process DownloadDbNSF {
-//     output:
-//     file dbNSF "./dbNSFP4.1a.txt.gz"
-//     file dbNSFIndex "./dbNSFP4.1a.txt.gz.tbi"
-
-//     script:
-//     """
-//     if [ ! -f ${params.dbNSF_path} ]; then
-//         echo "dbNSF database not found, downloading..."
-//         wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz
-//         wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz.tbi
-//     fi
-//     """
-// }
-
-// process DownloadDbSNP {
-//     output:
-//     file dbSNP "./dbsnp150.vcf.gz"
-//     file dbSNPIndex "./dbsnp150.vcf.gz.tbi"
-
-//     script:
-//     """
-//     if [ ! -f ${params.dbSNP_path} ]; then
-//         echo "dbSNP database not found, downloading..."
-//         wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz
-//         wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz.tbi
-//         mv 00-All.vcf.gz dbsnp150.vcf.gz
-//         mv 00-All.vcf.gz.tbi dbsnp150.vcf.gz.tbi
-//     fi
-//     """
-// }
-
 process FilterInputFiles {
     tag "Sample ${sample}"
 
@@ -195,10 +163,8 @@ process ExtractFields {
 }
 
 workflow {
-    // DownloadDbNSF //TODO - download databases if not available
-    // DownloadDbSNP
     // Grab input VCF files
-    file_channel = Channel.fromPath( params.input_folder_with_VCF_files, checkIfExists: true )
+    file_channel = Channel.fromPath( params.input_folder_with_VCF_files + '/*vcf.gz', checkIfExists: true )
     // Launch the pipeline
     FilterInputFiles(file_channel) \
         | AnnotateWithRSID \