diff --git a/README.md b/README.md index f8e6591..6b686ed 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,73 @@ # snpeff_annotation-nf -Annotate VCF files with SnpEff and dbSnp +## Nextflow DSL2 pipeline to annotate VCF files with SnpEff and dbSNP + +This repository contains a Nextflow DSL2 pipeline for annotating genetic variants in VCF files using SnpEff and the dbSNP database. The pipeline processes input VCF files, performs various annotations, and generates a comprehensive annotation file. + +## Prerequisites + +Make sure you have the following dependencies installed before running the pipeline: + +- [Nextflow](https://www.nextflow.io/) +- [conda](https://conda.io/projects/conda/en/latest/index.html) +- [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) +- [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) + +## Pipeline Overview + +1. **FilterInputFiles:** Filters input VCF files using PLINK 2 to retain PASS variants with a maximum of 2 alleles. + +2. **AnnotateWithRSID:** Annotates variants with RSID using SnpSift and the dbSNP database. + +3. **AnnotateWithImpact:** Annotates variants with functional impact using snpEff and a specified reference genome. + +4. **FullyAnnotateWithDbSNP:** Performs comprehensive annotation using SnpSift and the dbNSFP database, including information on gene impact, gnomAD data, REVEL scores, ClinVar information, and more. + +5. **ExtractFields:** Extracts relevant fields from the annotated VCF files and creates a tab-separated text file with a header for downstream analysis. + +## Usage + +1. Clone the repository: + + ```bash + git clone https://github.com/IARCbioinfo/snpeff_annotation-nf + cd snpeff_annotation-nf + ``` + +2. Adjust the `nextflow.config` file if necessary. The package versions are specified in the `environment.yml` file. + +3. 
Run the pipeline with: + + ```bash + nextflow run main.nf -profile conda + ``` + +## Input + +| Name | Default value | Description | +|-----------|---------------|-----------------| +| `--input_folder_with_VCF_files` | `${baseDir}/VCFs/` | Folder containing `*vcf.gz` files | + + +## Parameters + + * #### Optional + +| Name | Default value | Description | +|-----------|---------------|-----------------| +| `--reference_genome` | `GRCh37.75` | Reference genome | +| `--dbNSF_path` | `${baseDir}/dbNSFP4.1a.txt.gz` | [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) | +| `--dbSNP_path` | `${baseDir}/dbsnp150.vcf.gz` | [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) | +| `--output_path` | `${baseDir}/output` | Output folder | + +## Output + +The final annotated and extracted information will be available in the output directory as `full_annotation.txt`. + +## Customization + +- Adjust resource settings such as memory requirements in the `nextflow.config` file. +- Customize the annotation processes in the `main.nf` script based on your specific requirements. + +## Acknowledgments + +- This pipeline utilizes various bioinformatics tools and databases, including PLINK, bcftools, SnpSift, snpEff, dbNSFP, and dbSNP. 
diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..7b9688d --- /dev/null +++ b/environment.yml @@ -0,0 +1,11 @@ +name: annotation-pipeline +channels: + - bioconda + - defaults + - conda-forge +dependencies: + - bcftools=1.9 + - plink2=2.00a2.3 + - snpeff=5.0-0 + - snpsift=5.1 + - py-bgzip=0.4.0 \ No newline at end of file diff --git a/main.nf b/main.nf index 3dd3b51..977bbff 100755 --- a/main.nf +++ b/main.nf @@ -1,43 +1,11 @@ #!/usr/bin/env nextflow -params.input_folder_with_VCF_files = "${baseDir}/VCFs/*vcf.gz" +params.input_folder_with_VCF_files = "${baseDir}/VCFs/" params.reference_genome = "GRCh37.75" params.dbNSF_path = "${baseDir}/dbNSFP4.1a.txt.gz" params.dbSNP_path = "${baseDir}/dbsnp150.vcf.gz" params.output_path = "${baseDir}/output" -// process DownloadDbNSF { -// output: -// file dbNSF "./dbNSFP4.1a.txt.gz" -// file dbNSFIndex "./dbNSFP4.1a.txt.gz.tbi" - -// script: -// """ -// if [ ! -f ${params.dbNSF_path} ]; then -// echo "dbNSF database not found, downloading..." -// wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz -// wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz.tbi -// fi -// """ -// } - -// process DownloadDbSNP { -// output: -// file dbSNP "./dbsnp150.vcf.gz" -// file dbSNPIndex "./dbsnp150.vcf.gz.tbi" - -// script: -// """ -// if [ ! -f ${params.dbSNP_path} ]; then -// echo "dbSNP database not found, downloading..." 
-// wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz -// wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz.tbi -// mv 00-All.vcf.gz dbsnp150.vcf.gz -// mv 00-All.vcf.gz.tbi dbsnp150.vcf.gz.tbi -// fi -// """ -// } - process FilterInputFiles { tag "Sample ${sample}" @@ -195,10 +163,8 @@ process ExtractFields { } workflow { - // DownloadDbNSF //TODO - download databases if not available - // DownloadDbSNP // Grab input VCF files - file_channel = Channel.fromPath( params.input_folder_with_VCF_files, checkIfExists: true ) + file_channel = Channel.fromPath( params.input_folder_with_VCF_files + '/*vcf.gz', checkIfExists: true ) // Launch the pipeline FilterInputFiles(file_channel) \ | AnnotateWithRSID \ diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..4f57476 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,41 @@ +conda.enabled = true +conda.createTimeout = '3 h' + +profiles { + conda { + process.conda = "$baseDir/environment.yml" + } +} + +process { + shell = ['/bin/bash','-o','pipefail'] + withLabel: big_mem { + memory = 16.GB + } +} + +params.output_path = "${baseDir}/output" + +timeline { + enabled = true + overwrite = true + file = "${params.output_path}/nf-pipeline_info/annotation-nf_timeline.html" +} + +report { + enabled = true + overwrite = true + file = "${params.output_path}/nf-pipeline_info/annotation-nf_report.html" +} + +trace { + enabled = true + overwrite = true + file = "${params.output_path}/nf-pipeline_info/annotation-nf_trace.txt" +} + +dag { + enabled = true + overwrite = true + file = "${params.output_path}/nf-pipeline_info/annotation-nf_dag.html" +}