From d8cc901305d876624255c00d6fd1dbf3bafe3553 Mon Sep 17 00:00:00 2001 From: Sergey Senkin Date: Fri, 1 Dec 2023 15:42:54 +0100 Subject: [PATCH 1/4] Adding readme and configs --- README.md | 59 ++++++++++++++++++++++++++++++++++++++++++++++++- environment.yml | 11 +++++++++ nextflow.config | 41 ++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 environment.yml create mode 100644 nextflow.config diff --git a/README.md b/README.md index f8e6591..c6fe168 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,59 @@ # snpeff_annotation-nf -Annotate VCF files with SnpEff and dbSnp +## Nextflow pipeline to annotate VCF files with SnpEff and dbSnp + +This repository contains a Nextflow pipeline for annotating genetic variants in VCF (Variant Call Format) files. The pipeline processes input VCF files, performs various annotations, and generates a comprehensive annotation file. + +## Prerequisites + +Make sure you have the following dependencies installed before running the pipeline: + +- [Nextflow](https://www.nextflow.io/) +- [conda](https://conda.io/projects/conda/en/latest/index.html) +- [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) +- [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) + +## Pipeline Overview + +1. **FilterInputFiles:** Filters input VCF files using PLINK 2 to retain PASS variants with a maximum of 2 alleles. + +2. **AnnotateWithRSID:** Annotates variants with RSID using SnpSift and the dbSNP database. + +3. **AnnotateWithImpact:** Annotates variants with functional impact using snpEff and a specified reference genome. + +4. **FullyAnnotateWithDbSNP:** Performs comprehensive annotation using SnpSift and dbNSFP database, including information on gene impact, gnomAD data, REVEL scores, ClinVar information, and more. + +5. **ExtractFields:** Extracts relevant fields from the annotated VCF files and creates a tab-separated text file with a header for downstream analysis. 
+ +## Usage + +1. Clone the repository: + + ```bash + git clone https://github.com/IARCbioinfo/snpeff_annotation-nf + cd snpeff_annotation-nf + ``` + +2. Adjust the `nextflow.config` file if necessary. + +3. Run the pipeline with: + + ```bash + nextflow run main.nf -profile conda + ``` + +## Input + +The directory with input VCF files specified by the `params.input_folder_with_VCF_files` parameter (by default, `./VCFs/*vcf.gz`) + +## Output + +The final annotated and extracted information will be available in the `output` directory as `full_annotation.txt`. + +## Customization + +- Adjust the memory requirements etc in the `nextflow.config` file. +- Customize the annotation processes in the `main.nf` script based on your specific requirements. + +## Acknowledgments + +- This pipeline utilizes various bioinformatics tools and databases, including PLINK, bcftools, SnpSift, snpEff, dbNSFP, and dbSNP. diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..7b9688d --- /dev/null +++ b/environment.yml @@ -0,0 +1,11 @@ +name: annotation-pipeline +channels: + - bioconda + - defaults + - conda-forge +dependencies: + - bcftools=1.9 + - plink2=2.00a2.3 + - snpeff=5.0-0 + - snpsift=5.1 + - py-bgzip=0.4.0 \ No newline at end of file diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..4f57476 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,41 @@ +conda.enabled = true +conda.createTimeout = '3 h' + +profiles { + conda { + process.conda = "$baseDir/environment.yml" + } +} + +process { + shell = ['/bin/bash','-o','pipefail'] + withLabel: big_mem { + memory = 16.GB + } +} + +params.output_path = "${baseDir}/output" + +timeline { + enabled = true + overwrite = true + file = "${params.output_path}/nf-pipeline_info/annotation-nf_timeline.html" +} + +report { + enabled = true + overwrite = true + file = "${params.output_path}/nf-pipeline_info/annotation-nf_report.html" +} + +trace { + enabled = true + overwrite = true + file = 
"${params.output_path}/nf-pipeline_info/annotation-nf_trace.txt" +} + +dag { + enabled = true + overwrite = true + file = "${params.output_path}/nf-pipeline_info/annotation-nf_dag.html" +} From 83589751310902bbd1ccc030ccf505bd99f0e83c Mon Sep 17 00:00:00 2001 From: Sergey Senkin Date: Fri, 1 Dec 2023 15:57:07 +0100 Subject: [PATCH 2/4] Correcting README --- README.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c6fe168..7e1b5d0 100644 --- a/README.md +++ b/README.md @@ -43,11 +43,25 @@ Make sure you have the following dependencies installed before running the pipel ## Input -The directory with input VCF files specified by the `params.input_folder_with_VCF_files` parameter (by default, `./VCFs/*vcf.gz`) +| Name | Default value | Description | +|-----------|---------------|-----------------| +| `--output_foinput_folder_with_VCF_fileslder` | `${baseDir}/VCFs/` | Folder containing `*vcf.gz` files | + + +## Parameters + + * #### Optional + +| Name | Default value | Description | +|-----------|---------------|-----------------| +| `--reference_genome` | `GRCh37.75` | Folder containing `*vcf.gz` files | +| `--dbNSF_path` | `${baseDir}/dbNSFP4.1a.txt.gz` | [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) | +| `--dbSNP_path` | `${baseDir}/dbsnp150.vcf.gz` | [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) | +| `--output_path` | `${baseDir}/output` | Output folder | ## Output -The final annotated and extracted information will be available in the `output` directory as `full_annotation.txt`. +The final annotated and extracted information will be available in the output directory as `full_annotation.txt`. 
## Customization From 05b1e73b2f36161464dde370aadb1d1d22be30fa Mon Sep 17 00:00:00 2001 From: Sergey Senkin Date: Fri, 1 Dec 2023 15:59:03 +0100 Subject: [PATCH 3/4] Correcting README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7e1b5d0..5de7fff 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Make sure you have the following dependencies installed before running the pipel | Name | Default value | Description | |-----------|---------------|-----------------| -| `--output_foinput_folder_with_VCF_fileslder` | `${baseDir}/VCFs/` | Folder containing `*vcf.gz` files | +| `--input_folder_with_VCF_files` | `${baseDir}/VCFs/` | Folder containing `*vcf.gz` files | ## Parameters @@ -54,7 +54,7 @@ Make sure you have the following dependencies installed before running the pipel | Name | Default value | Description | |-----------|---------------|-----------------| -| `--reference_genome` | `GRCh37.75` | Folder containing `*vcf.gz` files | +| `--reference_genome` | `GRCh37.75` | Reference genome | | `--dbNSF_path` | `${baseDir}/dbNSFP4.1a.txt.gz` | [dbNSFP database](https://pcingola.github.io/SnpEff/ss_dbnsfp/) | | `--dbSNP_path` | `${baseDir}/dbsnp150.vcf.gz` | [dbSNP database](https://ftp.ncbi.nlm.nih.gov/snp/organisms) | | `--output_path` | `${baseDir}/output` | Output folder | From 359bbfe267dffe490d3a1ee169c63dce16f04ae9 Mon Sep 17 00:00:00 2001 From: Sergey Senkin Date: Fri, 1 Dec 2023 16:11:01 +0100 Subject: [PATCH 4/4] Cleaning up --- README.md | 6 +++--- main.nf | 38 ++------------------------------------ 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 5de7fff..6b686ed 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # snpeff_annotation-nf -## Nextflow pipeline to annotate VCF files with SnpEff and dbSnp +## Nextflow DSL2 pipeline to annotate VCF files with SnpEff and dbSNP -This repository contains a Nextflow pipeline for annotating genetic 
variants in VCF (Variant Call Format) files. The pipeline processes input VCF files, performs various annotations, and generates a comprehensive annotation file. +This repository contains a Nextflow DSL2 pipeline for annotating genetic variants in VCF files using SnpEff and the dbSNP database. The pipeline processes input VCF files, performs various annotations, and generates a comprehensive annotation file. ## Prerequisites @@ -33,7 +33,7 @@ Make sure you have the following dependencies installed before running the pipel cd snpeff_annotation-nf ``` -2. Adjust the `nextflow.config` file if necessary. +2. Adjust the `nextflow.config` file if necessary. The package versions are specified in the `environment.yml` file. 3. Run the pipeline with: diff --git a/main.nf b/main.nf index 3dd3b51..977bbff 100755 --- a/main.nf +++ b/main.nf @@ -1,43 +1,11 @@ #!/usr/bin/env nextflow -params.input_folder_with_VCF_files = "${baseDir}/VCFs/*vcf.gz" +params.input_folder_with_VCF_files = "${baseDir}/VCFs/" params.reference_genome = "GRCh37.75" params.dbNSF_path = "${baseDir}/dbNSFP4.1a.txt.gz" params.dbSNP_path = "${baseDir}/dbsnp150.vcf.gz" params.output_path = "${baseDir}/output" -// process DownloadDbNSF { -// output: -// file dbNSF "./dbNSFP4.1a.txt.gz" -// file dbNSFIndex "./dbNSFP4.1a.txt.gz.tbi" - -// script: -// """ -// if [ ! -f ${params.dbNSF_path} ]; then -// echo "dbNSF database not found, downloading..." -// wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz -// wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz.tbi -// fi -// """ -// } - -// process DownloadDbSNP { -// output: -// file dbSNP "./dbsnp150.vcf.gz" -// file dbSNPIndex "./dbsnp150.vcf.gz.tbi" - -// script: -// """ -// if [ ! -f ${params.dbSNP_path} ]; then -// echo "dbSNP database not found, downloading..." 
-// wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz -// wget https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz.tbi -// mv 00-All.vcf.gz dbsnp150.vcf.gz -// mv 00-All.vcf.gz.tbi dbsnp150.vcf.gz.tbi -// fi -// """ -// } - process FilterInputFiles { tag "Sample ${sample}" @@ -195,10 +163,8 @@ process ExtractFields { } workflow { - // DownloadDbNSF //TODO - download databases if not available - // DownloadDbSNP // Grab input VCF files - file_channel = Channel.fromPath( params.input_folder_with_VCF_files, checkIfExists: true ) + file_channel = Channel.fromPath( params.input_folder_with_VCF_files + '/*vcf.gz', checkIfExists: true ) // Launch the pipeline FilterInputFiles(file_channel) \ | AnnotateWithRSID \